ODIN Claude Sonnet 4.6 committed on
Commit ·
67e93c9
0
Parent(s):
Initial commit: ODIN multi-agent drilling intelligence system
Browse files
CrewAI + Gemini-powered agent system for the SPE GCS 2026 ML Challenge.
Analyzes Volve field drilling data (WITSML, DDR, EDM) via a Gradio chat UI.
Runtime data downloaded separately via scripts/download_data.py (HuggingFace).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- .env.example +7 -0
- .gitignore +87 -0
- README.md +182 -0
- challenge_reqs.txt +182 -0
- problem_statement.txt +160 -0
- promptfooconfig.yaml +41 -0
- requirements.txt +32 -0
- scripts/download_data.py +42 -0
- scripts/upload_data.py +64 -0
- src/__init__.py +0 -0
- src/agents/__init__.py +0 -0
- src/agents/answer_challenge.py +42 -0
- src/agents/app.py +1073 -0
- src/agents/crew.py +532 -0
- src/agents/data_tools.py +1141 -0
- src/agents/orchestrator.py +191 -0
- src/agents/promptfoo_provider.py +42 -0
- src/agents/tools.py +263 -0
- src/data_pipeline/__init__.py +0 -0
- src/data_pipeline/parse_ddr_xml.py +239 -0
- src/data_pipeline/parse_edm.py +118 -0
- src/data_pipeline/parse_witsml_logs.py +259 -0
- src/data_pipeline/run_pipeline.py +129 -0
- src/data_pipeline/utils.py +57 -0
- src/rag/__init__.py +0 -0
- src/rag/build_openviking_db.py +135 -0
- src/rag/build_vector_db.py +84 -0
- src/rag/build_volve_db.py +91 -0
- src/rag/count_chunks.py +13 -0
- src/rag/scrape_knowledge.py +208 -0
- src/rag/test_openviking.py +26 -0
- src/rag/test_retrieval.py +61 -0
- tests/prompts/analyst_prompt.txt +55 -0
- tests/prompts/auditor_prompt.txt +51 -0
- tests/prompts/historian_prompt.txt +38 -0
- tests/prompts/lead_prompt.txt +72 -0
.env.example
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copy this file to .env and fill in your values.
|
| 2 |
+
# NEVER commit .env to git.
|
| 3 |
+
|
| 4 |
+
# ── Google Gemini API ─────────────────────────────────────────────────────────
|
| 5 |
+
# Get your key at: https://aistudio.google.com/app/apikey
|
| 6 |
+
# Free tier: 15 RPM / 250K TPM / 500 RPD (Gemini 2.5 Flash)
|
| 7 |
+
GOOGLE_API_KEY=your_google_api_key_here
|
.gitignore
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Environment & Secrets ─────────────────────────────────────────────────────
|
| 2 |
+
.env
|
| 3 |
+
.env.*
|
| 4 |
+
!.env.example
|
| 5 |
+
|
| 6 |
+
# ── Python ────────────────────────────────────────────────────────────────────
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.py[cod]
|
| 9 |
+
*$py.class
|
| 10 |
+
*.so
|
| 11 |
+
venv/
|
| 12 |
+
.venv/
|
| 13 |
+
env/
|
| 14 |
+
ENV/
|
| 15 |
+
*.egg-info/
|
| 16 |
+
dist/
|
| 17 |
+
build/
|
| 18 |
+
.eggs/
|
| 19 |
+
|
| 20 |
+
# ── Data (Volve dataset — large, license-restricted) ──────────────────────────
|
| 21 |
+
data/
|
| 22 |
+
src/data/
|
| 23 |
+
|
| 24 |
+
# ── Outputs (generated charts & reports) ──────────────────────────────────────
|
| 25 |
+
outputs/
|
| 26 |
+
|
| 27 |
+
# ── Jupyter ───────────────────────────────────────────────────────────────────
|
| 28 |
+
.ipynb_checkpoints/
|
| 29 |
+
*.ipynb
|
| 30 |
+
|
| 31 |
+
# ── IDE & OS ──────────────────────────────────────────────────────────────────
|
| 32 |
+
.vscode/
|
| 33 |
+
.idea/
|
| 34 |
+
*.swp
|
| 35 |
+
*.swo
|
| 36 |
+
.DS_Store
|
| 37 |
+
Thumbs.db
|
| 38 |
+
*Zone.Identifier
|
| 39 |
+
|
| 40 |
+
# ── Home-directory dotfiles (repo root = $HOME) ───────────────────────────────
|
| 41 |
+
.bash_history
|
| 42 |
+
.bash_logout
|
| 43 |
+
.bashrc
|
| 44 |
+
.profile
|
| 45 |
+
.motd_shown
|
| 46 |
+
.sudo_as_admin_successful
|
| 47 |
+
.bash_aliases
|
| 48 |
+
.bash_profile
|
| 49 |
+
.cache/
|
| 50 |
+
.local/
|
| 51 |
+
.npm/
|
| 52 |
+
.pki/
|
| 53 |
+
.nv/
|
| 54 |
+
.landscape/
|
| 55 |
+
.config/
|
| 56 |
+
.claude/
|
| 57 |
+
.promptfoo/
|
| 58 |
+
|
| 59 |
+
# ── Logs & Temp files ─────────────────────────────────────────────────────────
|
| 60 |
+
*.log
|
| 61 |
+
*.tmp
|
| 62 |
+
*.txt.bak
|
| 63 |
+
test_logs.txt
|
| 64 |
+
test_output.txt
|
| 65 |
+
verify_*.txt
|
| 66 |
+
verify_*.md
|
| 67 |
+
flowpath.csv
|
| 68 |
+
volve_temp_unzipped/
|
| 69 |
+
|
| 70 |
+
# ── Root-level dev/debug scripts (not part of the application) ────────────────
|
| 71 |
+
/test_*.py
|
| 72 |
+
/debug_*.py
|
| 73 |
+
/check_*.py
|
| 74 |
+
/create_and_run_notebook.py
|
| 75 |
+
/extract_*.py
|
| 76 |
+
/parse_picks.py
|
| 77 |
+
/read_pdf*.py
|
| 78 |
+
/search_hf.py
|
| 79 |
+
/verify_*.py
|
| 80 |
+
/google-api-models.py
|
| 81 |
+
|
| 82 |
+
# ── Generated output docs ─────────────────────────────────────────────────────
|
| 83 |
+
challenge_output.md
|
| 84 |
+
dashboard_test_output.md
|
| 85 |
+
|
| 86 |
+
# ── PDFs in root (large binaries) ─────────────────────────────────────────────
|
| 87 |
+
/*.pdf
|
README.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ODIN — Operational Drilling Intelligence Network
|
| 2 |
+
|
| 3 |
+
> Multi-agent AI system for subsurface and drilling engineering analysis
|
| 4 |
+
> Built on the public Equinor Volve Field dataset · SPE GCS 2026 ML Challenge
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Overview
|
| 9 |
+
|
| 10 |
+
ODIN is a CrewAI-powered multi-agent system that answers complex drilling engineering questions by reasoning over structured data (WITSML, EDM) and unstructured reports (Daily Drilling Reports). It combines real-time data retrieval, RAG over domain knowledge, and a Gradio chat interface with inline Plotly visualizations.
|
| 11 |
+
|
| 12 |
+
**Key capabilities:**
|
| 13 |
+
- Drill phase distribution & NPT breakdown analysis
|
| 14 |
+
- ROP / WOB / RPM performance profiling
|
| 15 |
+
- Cross-well KPI comparison
|
| 16 |
+
- BHA configuration review and handover summaries
|
| 17 |
+
- Stuck-pipe and wellbore stability root-cause analysis
|
| 18 |
+
- Evidence-cited answers with confidence levels
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## Architecture
|
| 23 |
+
|
| 24 |
+
```
|
| 25 |
+
User Query
|
| 26 |
+
│
|
| 27 |
+
▼
|
| 28 |
+
Orchestrator (orchestrator.py)
|
| 29 |
+
│ Classifies query → lean or full crew
|
| 30 |
+
│
|
| 31 |
+
├── LEAN (chart / compare queries, ~40s)
|
| 32 |
+
│ Analyst ──► Lead (Odin)
|
| 33 |
+
│
|
| 34 |
+
└── FULL (deep analysis, ~80s)
|
| 35 |
+
Lead ──► Analyst ──► Historian ──► Lead (Odin)
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
**Agents:**
|
| 39 |
+
| Agent | Role |
|
| 40 |
+
|---|---|
|
| 41 |
+
| **Odin (Lead)** | Synthesizes findings, grounds in Volve KB |
|
| 42 |
+
| **Data Analyst** | Runs DDR / WITSML / EDM queries & Python charts |
|
| 43 |
+
| **Historian** | Searches operational history, validates stats |
|
| 44 |
+
|
| 45 |
+
**Tools available to agents:**
|
| 46 |
+
- `DDR_Query` — Daily Drilling Report search
|
| 47 |
+
- `WITSML_Analyst` — Realtime drilling log analysis
|
| 48 |
+
- `EDM_Technical_Query` — Casing, BHA, formation data
|
| 49 |
+
- `CrossWell_Comparison` — Multi-well KPI comparison
|
| 50 |
+
- `VolveHistory_SearchTool` — RAG over Volve campaign history
|
| 51 |
+
- `python_interpreter` — Pandas + Plotly for custom charts
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## Tech Stack
|
| 56 |
+
|
| 57 |
+
| Layer | Technology |
|
| 58 |
+
|---|---|
|
| 59 |
+
| LLM | Google Gemini 2.5 Flash (via `google-generativeai`) |
|
| 60 |
+
| Agent framework | CrewAI 1.10 |
|
| 61 |
+
| RAG / Vector store | ChromaDB + `sentence-transformers` |
|
| 62 |
+
| Data processing | Pandas, NumPy, PDFPlumber |
|
| 63 |
+
| Visualisation | Plotly (HTML) + Kaleido (PNG) |
|
| 64 |
+
| UI | Gradio 6 |
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
## Data
|
| 69 |
+
|
| 70 |
+
This project uses the **Equinor Volve Field open dataset** (released under the Volve Data Sharing Agreement).
|
| 71 |
+
|
| 72 |
+
> Download from: [https://www.equinor.com/energy/volve-data-sharing](https://www.equinor.com/energy/volve-data-sharing)
|
| 73 |
+
|
| 74 |
+
After downloading, extract to `data/raw/` and run the ETL pipeline:
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
python src/data_pipeline/run_pipeline.py
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
Then build the knowledge base:
|
| 81 |
+
|
| 82 |
+
```bash
|
| 83 |
+
python src/rag/build_volve_db.py
|
| 84 |
+
python src/rag/build_openviking_db.py
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
---
|
| 88 |
+
|
| 89 |
+
## Quickstart (judges)
|
| 90 |
+
|
| 91 |
+
```bash
|
| 92 |
+
# 1. Clone & install
|
| 93 |
+
git clone <repo-url>
|
| 94 |
+
cd odin
|
| 95 |
+
python -m venv venv
|
| 96 |
+
source venv/bin/activate # Windows: venv\Scripts\activate
|
| 97 |
+
pip install -r requirements.txt
|
| 98 |
+
|
| 99 |
+
# 2. Download runtime data (~400 MB knowledge bases + processed CSVs)
|
| 100 |
+
python scripts/download_data.py
|
| 101 |
+
|
| 102 |
+
# 3. Add your Gemini API key
|
| 103 |
+
cp .env.example .env
|
| 104 |
+
# Edit .env: set GOOGLE_API_KEY=<your key>
|
| 105 |
+
# Free key at: https://aistudio.google.com/app/apikey
|
| 106 |
+
|
| 107 |
+
# 4. Run
|
| 108 |
+
python src/agents/app.py
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
Open `http://localhost:7860` in your browser.
|
| 112 |
+
|
| 113 |
+
---
|
| 114 |
+
|
| 115 |
+
## Project Structure
|
| 116 |
+
|
| 117 |
+
```
|
| 118 |
+
odin/
|
| 119 |
+
├── src/
|
| 120 |
+
│ ├── agents/ # Main application
|
| 121 |
+
│ │ ├── app.py # Gradio UI (entry point)
|
| 122 |
+
│ │ ├── orchestrator.py # Query routing & streaming
|
| 123 |
+
│ │ ├── crew.py # CrewAI agent definitions & tasks
|
| 124 |
+
│ │ ├── tools.py # DDR / WITSML / EDM / RAG tools
|
| 125 |
+
│ │ └── data_tools.py # Python interpreter tool + data helpers
|
| 126 |
+
│ │
|
| 127 |
+
│ ├── data_pipeline/ # ETL: raw Volve data → processed CSV
|
| 128 |
+
│ │ ├── run_pipeline.py
|
| 129 |
+
│ │ ├── parse_witsml_logs.py
|
| 130 |
+
│ │ ├── parse_ddr_xml.py
|
| 131 |
+
│ │ └── parse_edm.py
|
| 132 |
+
│ │
|
| 133 |
+
│ └── rag/ # Knowledge base builders
|
| 134 |
+
│ ├── build_volve_db.py
|
| 135 |
+
│ └── build_openviking_db.py
|
| 136 |
+
│
|
| 137 |
+
├── tests/
|
| 138 |
+
│ └── prompts/ # Agent prompt test cases
|
| 139 |
+
│
|
| 140 |
+
├── data/ # ← NOT in git (download separately)
|
| 141 |
+
│ ├── raw/ # Original Volve dataset
|
| 142 |
+
│ ├── processed/ # ETL output (CSV / Parquet)
|
| 143 |
+
│ └── knowledge_base/ # ChromaDB vector stores
|
| 144 |
+
│
|
| 145 |
+
├── outputs/ # ← NOT in git (generated at runtime)
|
| 146 |
+
│ └── figures/ # Plotly charts (HTML + PNG)
|
| 147 |
+
│
|
| 148 |
+
├── requirements.txt
|
| 149 |
+
├── .env.example
|
| 150 |
+
└── promptfooconfig.yaml # Evaluation harness (PromptFoo)
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
---
|
| 154 |
+
|
| 155 |
+
## Rate Limits
|
| 156 |
+
|
| 157 |
+
The system is tuned for the Gemini free tier (15 RPM):
|
| 158 |
+
|
| 159 |
+
| Crew mode | LLM calls | Target time |
|
| 160 |
+
|---|---|---|
|
| 161 |
+
| Lean (chart / compare) | ~6 calls | ~40s |
|
| 162 |
+
| Full (deep analysis) | ~10 calls | ~80s |
|
| 163 |
+
|
| 164 |
+
Automatic 429 retry with exponential back-off (10 → 20 → 40 → 60s) is built in.
|
| 165 |
+
|
| 166 |
+
---
|
| 167 |
+
|
| 168 |
+
## Evaluation
|
| 169 |
+
|
| 170 |
+
```bash
|
| 171 |
+
# Run PromptFoo evaluation suite
|
| 172 |
+
npx promptfoo eval
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
Config: `promptfooconfig.yaml`
|
| 176 |
+
|
| 177 |
+
---
|
| 178 |
+
|
| 179 |
+
## License
|
| 180 |
+
|
| 181 |
+
Source code: MIT
|
| 182 |
+
Volve dataset: [Volve Data Sharing Agreement](https://www.equinor.com/energy/volve-data-sharing) (not included in this repo)
|
challenge_reqs.txt
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--- PAGE 1 ---
|
| 2 |
+
1
|
| 3 |
+
|
| 4 |
+
SPE GCS 2026 ML Challenge - Building
|
| 5 |
+
an Agentic AI System for Operational
|
| 6 |
+
Intelligence
|
| 7 |
+
Introduction
|
| 8 |
+
Drilling a well for oil and gas is a complex engineering activity. During drilling, large amounts of
|
| 9 |
+
data are generated. This includes numerical measurements such as depth and rate of
|
| 10 |
+
penetration, as well as written daily reports prepared by engineers at the rig site.
|
| 11 |
+
|
| 12 |
+
Engineers must combine these different types of information to understand what is happening,
|
| 13 |
+
detect problems, evaluate performance, and decide what actions to take next.
|
| 14 |
+
|
| 15 |
+
In this challenge, your task is to build an intelligent AI agent that can read drilling data and
|
| 16 |
+
reports, reason about them, and answer operational questions in a clear and evidence based
|
| 17 |
+
way.
|
| 18 |
+
|
| 19 |
+
The goal is not only to predict values. The goal is to explain what happened, why it happened,
|
| 20 |
+
and what are the potential next steps.
|
| 21 |
+
Aim of the Challenge
|
| 22 |
+
The aim of this challenge is to design an AI system that can combine structured data, written
|
| 23 |
+
reports, and domain knowledge to generate operational insights.
|
| 24 |
+
|
| 25 |
+
Your system should be able to:
|
| 26 |
+
• Understand drilling operations
|
| 27 |
+
• Identify drilling phases and activities
|
| 28 |
+
• Analyze performance and efficiency
|
| 29 |
+
• Evaluate drilling configurations
|
| 30 |
+
• Explain operational issues
|
| 31 |
+
• Provide decision support
|
| 32 |
+
The focus is on reasoning, clarity, and evidence based conclusions.

--- PAGE 2 ---
|
| 33 |
+
2
|
| 34 |
+
|
| 35 |
+
Data That Will Be Provided
|
| 36 |
+
Participants will receive extracted data from the public Equinor Volve Field dataset through a
|
| 37 |
+
shared repository.
|
| 38 |
+
|
| 39 |
+
The provided data will include:
|
| 40 |
+
|
| 41 |
+
1. Well metadata
|
| 42 |
+
This includes basic information about wells such as well name, sections drilled, and
|
| 43 |
+
configuration information.
|
| 44 |
+
|
| 45 |
+
2. Drilling data samples
|
| 46 |
+
This includes structured time based or depth based measurements such as:
|
| 47 |
+
• Depth
|
| 48 |
+
• Rate of penetration
|
| 49 |
+
• Rotation speed
|
| 50 |
+
• Torque
|
| 51 |
+
• Pump pressure
|
| 52 |
+
• Flow rate
|
| 53 |
+
• Hookload or weight on bit
|
| 54 |
+
3. Daily drilling reports
|
| 55 |
+
These are written reports prepared by engineers. They describe what activities were performed
|
| 56 |
+
during the day, what problems occurred, and what actions were taken.
|
| 57 |
+
|
| 58 |
+
4. Volve documentation
|
| 59 |
+
This includes supporting documents that explain the dataset and provide background
|
| 60 |
+
information.
|
| 61 |
+
|
| 62 |
+
The data will be provided in raw form. There will be no predefined drilling phase labels, no event
|
| 63 |
+
tags, and no performance ratings. Participants must interpret and structure the data
|
| 64 |
+
themselves.
|
| 65 |
+
Open Knowledge Sources You May Use
|
| 66 |
+
Participants are encouraged to use publicly available reference material as a knowledge base.
|
| 67 |
+
This material is not curated or simplified. It must be retrieved and interpreted by your system.

--- PAGE 3 ---
|
| 68 |
+
3
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
Examples of public knowledge sources include:
|
| 72 |
+
• Schlumberger Oilfield Glossary
|
| 73 |
+
This explains drilling terminology such as rate of penetration, tripping, circulation, and
|
| 74 |
+
non-productive time.
|
| 75 |
+
• SPE PetroWiki
|
| 76 |
+
This contains articles explaining drilling concepts, tools, and operational practices.
|
| 77 |
+
• IADC drilling terminology documents
|
| 78 |
+
These explain standard drilling acronyms and definitions.
|
| 79 |
+
• General engineering references related to drilling and well construction.
|
| 80 |
+
You may use these sources to help your system understand domain terms and concepts.
|
| 81 |
+
What Your System Must Do
|
| 82 |
+
Your system should function as an intelligent agent. It should be able to answer operational
|
| 83 |
+
questions using both numerical data and written reports.
|
| 84 |
+
|
| 85 |
+
The types of questions will cover multiple levels of reasoning.
|
| 86 |
+
Drilling Phase Identification & Validation
|
| 87 |
+
• Identify and label the major drilling phases for <Well Name> over the selected interval,
|
| 88 |
+
including the evidence used for each phase.
|
| 89 |
+
• Detect significant operational or phase transitions, noting when they occurred and why
|
| 90 |
+
they matter.
|
| 91 |
+
• Assess how well the inferred drilling phases align with the daily drilling reports.
|
| 92 |
+
• Identify periods where the operational state is ambiguous and explain the sources of
|
| 93 |
+
uncertainty.
|
| 94 |
+
Time & Efficiency Analysis
|
| 95 |
+
• Distinguish between productive and non-productive drilling time, and justify the criteria
|
| 96 |
+
used.
|
| 97 |
+
• Define drilling efficiency for <Well Name> and evaluate how it changes over time.
|
| 98 |
+
• Compare overall drilling efficiency between <Well Name> and at least one other well.
|
| 99 |
+
• Evaluate whether higher drilling speed was associated with stable operations or
|
| 100 |
+
increased operational risk.

--- PAGE 4 ---
|
| 101 |
+
4
|
| 102 |
+
|
| 103 |
+
Section & ROP Performance
|
| 104 |
+
• Determine which hole section appears easiest to drill and which appears most
|
| 105 |
+
challenging, with supporting evidence.
|
| 106 |
+
• Analyze how rate of penetration varies across sections and describe notable trends.
|
| 107 |
+
• Identify periods of exceptional drilling performance and explain why they stand out.
|
| 108 |
+
Configuration & BHA Effectiveness
|
| 109 |
+
• Identify the most effective drilling configuration or BHA run and explain the context.
|
| 110 |
+
• Assess whether changes in configuration coincide with changes in performance.
|
| 111 |
+
• Evaluate configuration effectiveness by hole section.
|
| 112 |
+
• Identify configurations that appear robust across operating conditions, as well as those
|
| 113 |
+
that underperformed and potential reasons why.
|
| 114 |
+
• Assess how daily drilling reports support or contradict conclusions about configuration
|
| 115 |
+
effectiveness.
|
| 116 |
+
Operational Issues & Root Causes
|
| 117 |
+
• Identify key operational issues encountered while drilling <Well Name>.
|
| 118 |
+
• Propose likely contributing factors or root causes.
|
| 119 |
+
• Analyze whether these issues persisted, resolved, or recurred over time.
|
| 120 |
+
• Highlight areas where drilling data and daily reports provide conflicting interpretations.
|
| 121 |
+
Synthesis & Recommendations
|
| 122 |
+
• Compare the drilling phase distribution of <Well Name> with another well <Well
|
| 123 |
+
Name1> and explain key differences.
|
| 124 |
+
• Describe remaining uncertainties in the analysis and their potential impact.
|
| 125 |
+
• Determine which operational team(s) should be notified based on the findings, and why.
|
| 126 |
+
• Produce a concise operational handover summary for the next shift.
|
| 127 |
+
• Extract key lessons learned that could apply to future wells.
|
| 128 |
+
• Based on observed trends, describe expected performance in a similar section of
|
| 129 |
+
another well.
|
| 130 |
+
• Recommend a drilling configuration for similar conditions.
|
| 131 |
+
• Identify what additional data would most improve confidence in the conclusions.
|
| 132 |
+
Expected Output Format
|
| 133 |
+
For each question, your system should provide:

--- PAGE 5 ---
|
| 134 |
+
5
|
| 135 |
+
|
| 136 |
+
• A clear answer
|
| 137 |
+
• Evidence from drilling data
|
| 138 |
+
• Evidence from daily reports
|
| 139 |
+
• Explanation of reasoning
|
| 140 |
+
• Statement of assumptions
|
| 141 |
+
• Confidence level or uncertainty
|
| 142 |
+
Answers should be understandable to an engineer reviewing your work.
|
| 143 |
+
Design Criteria
|
| 144 |
+
You may use:
|
| 145 |
+
• Open source libraries
|
| 146 |
+
• Local language models
|
| 147 |
+
• Free tier cloud models
|
| 148 |
+
• Statistical analysis methods
|
| 149 |
+
• Machine learning models
|
| 150 |
+
• Retrieval augmented generation systems
|
| 151 |
+
• Tool based agents
|
| 152 |
+
You are not required to use any proprietary software.
|
| 153 |
+
|
| 154 |
+
Your system design should prioritize:
|
| 155 |
+
• Transparency
|
| 156 |
+
• Traceability of evidence
|
| 157 |
+
• Clear reasoning
|
| 158 |
+
• Reproducibility
|
| 159 |
+
Complexity alone will not be rewarded.
|
| 160 |
+
Evaluation Criteria
|
| 161 |
+
Evaluation will be based on a structured question set.
|
| 162 |
+
Solutions will be assessed based on:
|
| 163 |
+
• Quality of reasoning
|
| 164 |
+
• Correct and relevant use of evidence

--- PAGE 6 ---
|
| 165 |
+
6
|
| 166 |
+
|
| 167 |
+
• Consistency across answers
|
| 168 |
+
• Clarity of assumptions
|
| 169 |
+
• Handling of uncertainty
|
| 170 |
+
• Practical relevance of insights
|
| 171 |
+
There is no single correct answer for the questions. Different approaches are acceptable if they
|
| 172 |
+
are well justified and supported by evidence.
|
| 173 |
+
|
| 174 |
+
The evaluation emphasizes reasoning quality rather than matching a specific numeric answer.
|
| 175 |
+
Summary
|
| 176 |
+
This challenge asks you to build more than a predictive model. It asks you to design an AI system
|
| 177 |
+
that can read data, understand context, reason through engineering problems, and
|
| 178 |
+
communicate conclusions clearly.
|
| 179 |
+
|
| 180 |
+
The objective is to explore how intelligent systems can assist real world operational decision
|
| 181 |
+
making using raw data and public domain knowledge.
|
| 182 |
+
|
problem_statement.txt
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
SPE GCS 2026 ML Challenge - Building
|
| 2 |
+
an Agentic AI System for Operational
|
| 3 |
+
Intelligence
|
| 4 |
+
Introduction
|
| 5 |
+
Drilling a well for oil and gas is a complex engineering activity. During drilling, large amounts of
|
| 6 |
+
data are generated. This includes numerical measurements such as depth and rate of
|
| 7 |
+
penetration, as well as written daily reports prepared by engineers at the rig site.
|
| 8 |
+
Engineers must combine these different types of information to understand what is happening,
|
| 9 |
+
detect problems, evaluate performance, and decide what actions to take next.
|
| 10 |
+
In this challenge, your task is to build an intelligent AI agent that can read drilling data and
|
| 11 |
+
reports, reason about them, and answer operational questions in a clear and evidence based
|
| 12 |
+
way.
|
| 13 |
+
The goal is not only to predict values. The goal is to explain what happened, why it happened,
|
| 14 |
+
and what are the potential next steps.
|
| 15 |
+
Aim of the Challenge
|
| 16 |
+
The aim of this challenge is to design an AI system that can combine structured data, written
|
| 17 |
+
reports, and domain knowledge to generate operational insights.
|
| 18 |
+
Your system should be able to:
|
| 19 |
+
• Understand drilling operations
|
| 20 |
+
• Identify drilling phases and activities
|
| 21 |
+
• Analyze performance and efficiency
|
| 22 |
+
• Evaluate drilling configurations
|
| 23 |
+
• Explain operational issues
|
| 24 |
+
• Provide decision support
|
| 25 |
+
The focus is on reasoning, clarity, and evidence based conclusions.
|
| 26 |
+
1
|
| 27 |
+
Data That Will Be Provided
|
| 28 |
+
Participants will receive extracted data from the public Equinor Volve Field dataset through a
|
| 29 |
+
shared repository.
|
| 30 |
+
The provided data will include:
|
| 31 |
+
1. Well metadata
|
| 32 |
+
This includes basic information about wells such as well name, sections drilled, and
|
| 33 |
+
configuration information.
|
| 34 |
+
2. Drilling data samples
|
| 35 |
+
This includes structured time based or depth based measurements such as:
|
| 36 |
+
• Depth
|
| 37 |
+
• Rate of penetration
|
| 38 |
+
• Rotation speed
|
| 39 |
+
• Torque
|
| 40 |
+
• Pump pressure
|
| 41 |
+
• Flow rate
|
| 42 |
+
• Hookload or weight on bit
|
| 43 |
+
3. Daily drilling reports
|
| 44 |
+
These are written reports prepared by engineers. They describe what activities were performed
|
| 45 |
+
during the day, what problems occurred, and what actions were taken.
|
| 46 |
+
4. Volve documentation
|
| 47 |
+
This includes supporting documents that explain the dataset and provide background
|
| 48 |
+
information.
|
| 49 |
+
The data will be provided in raw form. There will be no predefined drilling phase labels, no event
|
| 50 |
+
tags, and no performance ratings. Participants must interpret and structure the data
|
| 51 |
+
themselves.
|
| 52 |
+
Open Knowledge Sources You May Use
|
| 53 |
+
Participants are encouraged to use publicly available reference material as a knowledge base.
|
| 54 |
+
This material is not curated or simplified. It must be retrieved and interpreted by your system.
|
| 55 |
+
2
|
| 56 |
+
Examples of public knowledge sources include:
|
| 57 |
+
• Schlumberger Oilfield Glossary
|
| 58 |
+
This explains drilling terminology such as rate of penetration, tripping, circulation, and
|
| 59 |
+
non-productive time.
|
| 60 |
+
• SPE PetroWiki
|
| 61 |
+
This contains articles explaining drilling concepts, tools, and operational practices.
|
| 62 |
+
• IADC drilling terminology documents
|
| 63 |
+
These explain standard drilling acronyms and definitions.
|
| 64 |
+
• General engineering references related to drilling and well construction.
|
| 65 |
+
You may use these sources to help your system understand domain terms and concepts.
|
| 66 |
+
What Your System Must Do
|
| 67 |
+
Your system should function as an intelligent agent. It should be able to answer operational
|
| 68 |
+
questions using both numerical data and written reports.
|
| 69 |
+
The types of questions will cover multiple levels of reasoning.
|
| 70 |
+
Drilling Phase Identification & Validation
|
| 71 |
+
• Identify and label the major drilling phases for <Well Name> over the selected interval,
|
| 72 |
+
including the evidence used for each phase.
|
| 73 |
+
• Detect significant operational or phase transitions, noting when they occurred and why
|
| 74 |
+
they matter.
|
| 75 |
+
• Assess how well the inferred drilling phases align with the daily drilling reports.
|
| 76 |
+
• Identify periods where the operational state is ambiguous and explain the sources of
|
| 77 |
+
uncertainty.
|
| 78 |
+
Time & Efficiency Analysis
|
| 79 |
+
• Distinguish between productive and non-productive drilling time, and justify the criteria
|
| 80 |
+
used.
|
| 81 |
+
• Define drilling efficiency for <Well Name> and evaluate how it changes over time.
|
| 82 |
+
• Compare overall drilling efficiency between <Well Name> and at least one other well.
|
| 83 |
+
• Evaluate whether higher drilling speed was associated with stable operations or
|
| 84 |
+
increased operational risk.
|
| 85 |
+
3
|
| 86 |
+
Section & ROP Performance
|
| 87 |
+
• Determine which hole section appears easiest to drill and which appears most
|
| 88 |
+
challenging, with supporting evidence.
|
| 89 |
+
• Analyze how rate of penetration varies across sections and describe notable trends.
|
| 90 |
+
• Identify periods of exceptional drilling performance and explain why they stand out.
|
| 91 |
+
Configuration & BHA Effectiveness
|
| 92 |
+
• Identify the most effective drilling configuration or BHA run and explain the context.
|
| 93 |
+
• Assess whether changes in configuration coincide with changes in performance.
|
| 94 |
+
• Evaluate configuration effectiveness by hole section.
|
| 95 |
+
• Identify configurations that appear robust across operating conditions, as well as those
|
| 96 |
+
that underperformed and potential reasons why.
|
| 97 |
+
• Assess how daily drilling reports support or contradict conclusions about configuration
|
| 98 |
+
effectiveness.
|
| 99 |
+
Operational Issues & Root Causes
|
| 100 |
+
• Identify key operational issues encountered while drilling <Well Name>.
|
| 101 |
+
• Propose likely contributing factors or root causes.
|
| 102 |
+
• Analyze whether these issues persisted, resolved, or recurred over time.
|
| 103 |
+
• Highlight areas where drilling data and daily reports provide conflicting interpretations.
|
| 104 |
+
Synthesis & Recommendations
|
| 105 |
+
• Compare the drilling phase distribution of <Well Name> with another well <Well
|
| 106 |
+
Name1> and explain key differences.
|
| 107 |
+
• Describe remaining uncertainties in the analysis and their potential impact.
|
| 108 |
+
• Determine which operational team(s) should be notified based on the findings, and why.
|
| 109 |
+
• Produce a concise operational handover summary for the next shift.
|
| 110 |
+
• Extract key lessons learned that could apply to future wells.
|
| 111 |
+
• Based on observed trends, describe expected performance in a similar section of
|
| 112 |
+
another well.
|
| 113 |
+
• Recommend a drilling configuration for similar conditions.
|
| 114 |
+
• Identify what additional data would most improve confidence in the conclusions.
|
| 115 |
+
Expected Output Format
|
| 116 |
+
For each question, your system should provide:
|
| 117 |
+
4
|
| 118 |
+
• A clear answer
|
| 119 |
+
• Evidence from drilling data
|
| 120 |
+
• Evidence from daily reports
|
| 121 |
+
• Explanation of reasoning
|
| 122 |
+
• Statement of assumptions
|
| 123 |
+
• Confidence level or uncertainty
|
| 124 |
+
Answers should be understandable to an engineer reviewing your work.
|
| 125 |
+
Design Criteria
|
| 126 |
+
You may use:
|
| 127 |
+
• Open source libraries
|
| 128 |
+
• Local language models
|
| 129 |
+
• Free tier cloud models
|
| 130 |
+
• Statistical analysis methods
|
| 131 |
+
• Machine learning models
|
| 132 |
+
• Retrieval augmented generation systems
|
| 133 |
+
• Tool based agents
|
| 134 |
+
You are not required to use any proprietary software.
|
| 135 |
+
Your system design should prioritize:
|
| 136 |
+
• Transparency
|
| 137 |
+
• Traceability of evidence
|
| 138 |
+
• Clear reasoning
|
| 139 |
+
• Reproducibility
|
| 140 |
+
Complexity alone will not be rewarded.
|
| 141 |
+
Evaluation Criteria
|
| 142 |
+
Evaluation will be based on a structured question set.
|
| 143 |
+
Solutions will be assessed based on:
|
| 144 |
+
• Quality of reasoning
|
| 145 |
+
• Correct and relevant use of evidence
|
| 146 |
+
5
|
| 147 |
+
• Consistency across answers
|
| 148 |
+
• Clarity of assumptions
|
| 149 |
+
• Handling of uncertainty
|
| 150 |
+
• Practical relevance of insights
|
| 151 |
+
There is no single correct answer for the questions. Different approaches are acceptable if they
|
| 152 |
+
are well justified and supported by evidence.
|
| 153 |
+
The evaluation emphasizes reasoning quality rather than matching a specific numeric answer.
|
| 154 |
+
Summary
|
| 155 |
+
This challenge asks you to build more than a predictive model. It asks you to design an AI system
|
| 156 |
+
that can read data, understand context, reason through engineering problems, and
|
| 157 |
+
communicate conclusions clearly.
|
| 158 |
+
The objective is to explore how intelligent systems can assist real world operational decision
|
| 159 |
+
making using raw data and public domain knowledge.
|
| 160 |
+
6
|
promptfooconfig.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# promptfooconfig.yaml
|
| 2 |
+
# --------------------
|
| 3 |
+
# Configuration for evaluating the AI Drilling Copilot Agents
|
| 4 |
+
#
|
| 5 |
+
# NOTE: This rubric is completely customizable!
|
| 6 |
+
# You can tweak the test cases, prompts, and evaluation rules precisely
|
| 7 |
+
# to match the SPE GCS 2026 ML Challenge evaluation criteria.
|
| 8 |
+
|
| 9 |
+
description: "SPE GCS 2026: Agent Rubric Evaluation"
|
| 10 |
+
|
| 11 |
+
providers:
|
| 12 |
+
# Using Promptfoo's native Google provider. 3.1 is not fully supported by the npm plugin yet.
|
| 13 |
+
- id: google:gemini-2.5-flash-preview
|
| 14 |
+
label: "baseline-agent-model"
|
| 15 |
+
|
| 16 |
+
prompts:
|
| 17 |
+
- file://tests/prompts/analyst_prompt.txt
|
| 18 |
+
- file://tests/prompts/historian_prompt.txt
|
| 19 |
+
- file://tests/prompts/auditor_prompt.txt
|
| 20 |
+
- file://tests/prompts/lead_prompt.txt
|
| 21 |
+
|
| 22 |
+
tests:
|
| 23 |
+
- vars:
|
| 24 |
+
question: "Which hole section in well 15/9-19 B was the most challenging to drill?"
|
| 25 |
+
context: "DDR data shows NPT of 45 hours in the 12.25 inch section due to severe losses. WITSML confirms high torque fluctuations."
|
| 26 |
+
assert:
|
| 27 |
+
- type: "icontains"
|
| 28 |
+
value: "12.25"
|
| 29 |
+
- type: "llm-rubric"
|
| 30 |
+
value: "The response MUST explicitly state a 'Confidence Level' or 'Uncertainty'."
|
| 31 |
+
- type: "llm-rubric"
|
| 32 |
+
value: "The response must clearly state the evidence (either data or reports) used to make the conclusion."
|
| 33 |
+
|
| 34 |
+
- vars:
|
| 35 |
+
question: "What were the lessons learned regarding weather-induced NPT?"
|
| 36 |
+
context: "Historical Volve data indicates waiting on weather (WOW) caused 15% of all delays, particularly stalling riser pulling operations."
|
| 37 |
+
assert:
|
| 38 |
+
- type: "llm-rubric"
|
| 39 |
+
value: "The response must synthesize the context to identify actionable lessons learned, not just repeat the data."
|
| 40 |
+
- type: "not-icontains"
|
| 41 |
+
value: "As an AI language model"
|
requirements.txt
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── LLM / Agent orchestration ─────────────────────────────────────────────────
|
| 2 |
+
crewai==1.10.1
|
| 3 |
+
google-generativeai==0.8.6
|
| 4 |
+
langchain==1.2.10
|
| 5 |
+
langchain-core==1.2.19
|
| 6 |
+
langchain-community==0.4.1
|
| 7 |
+
langchain-google-genai==4.2.1
|
| 8 |
+
|
| 9 |
+
# ── RAG / Vector store ────────────────────────────────────────────────────────
|
| 10 |
+
chromadb==1.5.5
|
| 11 |
+
sentence-transformers==5.3.0
|
| 12 |
+
|
| 13 |
+
# ── Data processing ───────────────────────────────────────────────────────────
|
| 14 |
+
pandas==2.3.1
|
| 15 |
+
numpy==2.3.2
|
| 16 |
+
pdfplumber==0.11.9
|
| 17 |
+
openpyxl==3.1.5
|
| 18 |
+
|
| 19 |
+
# ── Visualisation ─────────────────────────────────────────────────────────────
|
| 20 |
+
plotly==6.3.0
|
| 21 |
+
matplotlib==3.10.5
|
| 22 |
+
kaleido==0.2.1 # 0.2.x uses bundled binary (no Chrome needed); 1.x requires Chrome
|
| 23 |
+
|
| 24 |
+
# ── UI ────────────────────────────────────────────────────────────────────────
|
| 25 |
+
gradio==6.9.0
|
| 26 |
+
|
| 27 |
+
# ── Utilities ─────────────────────────────────────────────────────────────────
|
| 28 |
+
python-dotenv==1.1.1
|
| 29 |
+
huggingface_hub>=0.23.0
|
| 30 |
+
requests==2.32.5
|
| 31 |
+
httpx==0.28.1
|
| 32 |
+
uvicorn==0.41.0
|
scripts/download_data.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
download_data.py
|
| 3 |
+
----------------
|
| 4 |
+
Downloads the ODIN runtime data (processed CSVs + ChromaDB knowledge bases)
|
| 5 |
+
from Hugging Face Hub into the local data/ directory.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python scripts/download_data.py
|
| 9 |
+
|
| 10 |
+
Requirements:
|
| 11 |
+
pip install huggingface_hub
|
| 12 |
+
"""
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
HF_REPO_ID = "SPE-GCS-2026/odin-volve-data" # <- update if repo is renamed
|
| 18 |
+
LOCAL_DIR = Path(__file__).parent.parent / "data"
|
| 19 |
+
|
| 20 |
+
def main():
    """Fetch the ODIN runtime data snapshot from the Hub into data/."""
    # Import lazily so a missing dependency produces a friendly hint,
    # not a traceback.
    try:
        from huggingface_hub import snapshot_download
    except ImportError:
        print("huggingface_hub not installed. Run: pip install huggingface_hub")
        sys.exit(1)

    for line in (
        f"Downloading ODIN data from HuggingFace ({HF_REPO_ID}) …",
        f"Destination: {LOCAL_DIR.resolve()}",
        "This may take a few minutes (~400 MB knowledge bases + processed CSVs).\n",
    ):
        print(line)

    snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        local_dir=str(LOCAL_DIR),
        ignore_patterns=["*.git*", "README.md"],
    )

    print("\nDone. You can now run the app:")
    print(" python src/agents/app.py")
|
| 40 |
+
|
| 41 |
+
# Script entry point: run the downloader only when invoked directly.
if __name__ == "__main__":
    main()
|
scripts/upload_data.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
upload_data.py
|
| 3 |
+
--------------
|
| 4 |
+
Uploads the ODIN runtime data to Hugging Face Hub (run this ONCE as the repo owner).
|
| 5 |
+
|
| 6 |
+
Uploads:
|
| 7 |
+
data/processed/ — cleaned DDR / WITSML / EDM CSVs
|
| 8 |
+
data/knowledge_base/ — Volve history ChromaDB vector store
|
| 9 |
+
data/viking_context/ — OpenViking ChromaDB vector store
|
| 10 |
+
|
| 11 |
+
Usage:
|
| 12 |
+
huggingface-cli login # authenticate first
|
| 13 |
+
python scripts/upload_data.py
|
| 14 |
+
|
| 15 |
+
Requirements:
|
| 16 |
+
pip install huggingface_hub
|
| 17 |
+
"""
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
HF_REPO_ID = "SPE-GCS-2026/odin-volve-data" # <- your HF org/username + repo name
|
| 22 |
+
ROOT = Path(__file__).parent.parent
|
| 23 |
+
|
| 24 |
+
UPLOAD_DIRS = [
|
| 25 |
+
ROOT / "data" / "processed",
|
| 26 |
+
ROOT / "data" / "knowledge_base",
|
| 27 |
+
ROOT / "data" / "viking_context",
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
def main():
    """Create (if needed) the HF dataset repo and upload each data folder.

    Folders listed in ``UPLOAD_DIRS`` that do not exist locally are skipped,
    so the script can be re-run safely after a partial pipeline run.
    """
    # Import lazily so a missing dependency produces a friendly hint,
    # not a traceback.
    try:
        from huggingface_hub import HfApi, create_repo
    except ImportError:
        print("huggingface_hub not installed. Run: pip install huggingface_hub")
        sys.exit(1)

    api = HfApi()

    # Create dataset repo if it doesn't exist (exist_ok=True makes this
    # idempotent; the except is a belt-and-braces guard for auth/permission
    # errors so a pre-existing repo never aborts the upload).
    try:
        create_repo(HF_REPO_ID, repo_type="dataset", exist_ok=True, private=False)
        print(f"Dataset repo ready: https://huggingface.co/datasets/{HF_REPO_ID}\n")
    except Exception as e:
        print(f"Repo creation warning (may already exist): {e}")

    for folder in UPLOAD_DIRS:
        if not folder.exists():
            print(f"Skipping {folder} (not found)")
            continue
        hf_path = folder.relative_to(ROOT)  # e.g. data/processed
        print(f"Uploading {folder} → {hf_path} …")
        api.upload_folder(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            folder_path=str(folder),
            # BUGFIX: use as_posix() — repo paths must use forward slashes;
            # str(Path) would yield backslashes on Windows and create wrong
            # paths in the dataset repo.
            path_in_repo=hf_path.as_posix(),
        )
        print(f" ✓ {hf_path} uploaded\n")

    print("All done. Judges can now download with:")
    print(" python scripts/download_data.py")
|
| 62 |
+
|
| 63 |
+
# Script entry point: run the uploader only when invoked directly.
if __name__ == "__main__":
    main()
|
src/__init__.py
ADDED
|
File without changes
|
src/agents/__init__.py
ADDED
|
File without changes
|
src/agents/answer_challenge.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
answer_challenge.py
|
| 3 |
+
-------------------
|
| 4 |
+
CLI entry point for the Drilling Intelligence System.
|
| 5 |
+
Uses the lean orchestrator (1-2 LLM calls) instead of CrewAI (10+ LLM calls).
|
| 6 |
+
"""
|
| 7 |
+
import sys
|
| 8 |
+
import logging
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from src.agents.orchestrator import run_pipeline
|
| 11 |
+
|
| 12 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 13 |
+
log = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def main(question: str):
    """Run the lean orchestrator on *question*, print and save the report."""
    banner = "=" * 70
    print("\n" + banner)
    print("⛽ DRILLING INTELLIGENCE SYSTEM")
    print(banner)
    print(f"\nQuestion: {question}\n")
    print("-" * 70)

    answer, needs, evidence, steps = run_pipeline(question)

    print("\n" + banner)
    print("📄 FINAL REPORT")
    print(banner)
    print(answer)

    # Persist the final report to the working directory.
    out_path = Path("challenge_output.md")
    out_path.write_text(answer, encoding="utf-8")
    print(f"\n💾 Report saved to {out_path.absolute()}")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
if __name__ == "__main__":
    # Require exactly one CLI argument: the question text.
    args = sys.argv[1:]
    if not args:
        print("Usage: python src/agents/answer_challenge.py \"<Your Question>\"")
        print('Example: python src/agents/answer_challenge.py "What is rate of penetration?"')
        sys.exit(1)

    main(args[0])
|
src/agents/app.py
ADDED
|
@@ -0,0 +1,1073 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py
|
| 3 |
+
------
|
| 4 |
+
Odin Drilling Intelligence System — Competition UI v3.0
|
| 5 |
+
SPE GCS 2026 ML Challenge · Full Redesign
|
| 6 |
+
|
| 7 |
+
Changes from v2.3:
|
| 8 |
+
- Tabbed right panel: Challenge Questions | Agent HUD | Charts
|
| 9 |
+
- 24 challenge-aligned question buttons covering all rubric categories
|
| 10 |
+
- Vertical pipeline HUD with telemetry (tools used, elapsed time, action count)
|
| 11 |
+
- Well selector dropdown (all 23 Volve wells) with auto-injection into queries
|
| 12 |
+
- Answer metadata chips: sources used + confidence badge + elapsed time
|
| 13 |
+
- Dedicated chart panel (no more iframes inside chat)
|
| 14 |
+
- Export to Markdown button
|
| 15 |
+
- Clear session button
|
| 16 |
+
- Clean brand header (no internal version/phase strings)
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import time
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
import tempfile
|
| 23 |
+
import gradio as gr
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from src.agents.orchestrator import run_pipeline
|
| 26 |
+
|
| 27 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 28 |
+
# DATA: Wells + Challenge Questions
|
| 29 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 30 |
+
|
| 31 |
+
# Example questions shown as clickable prompts in the UI; each targets a
# different capability (NPT analysis, campaign-wide events, handover summary,
# multi-well comparison).
SUGGESTED_PROMPTS = [
    "Analyze and provide a chart of the drilling phase distribution and NPT breakdown for 15/9-F-12, with evidence from DDR and WITSML.",
    "What were the main stuck pipe and wellbore stability events across the Volve campaign, and what formation was responsible?",
    "Produce an operational handover summary for 15/9-F-14 and recommend a BHA configuration for the next 12.25-inch section.",
    "Do an in-depth analysis of the drilling performance of three Volve wells and compare their key KPIs.",
]
|
| 37 |
+
|
| 38 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 39 |
+
# CSS
|
| 40 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 41 |
+
|
| 42 |
+
CUSTOM_CSS = """
|
| 43 |
+
@import url('https://fonts.googleapis.com/css2?family=Share+Tech+Mono&family=Inter:wght@400;500;600;700;900&display=swap');
|
| 44 |
+
|
| 45 |
+
/* ── Base ── */
|
| 46 |
+
.gradio-container {
|
| 47 |
+
max-width: 100% !important;
|
| 48 |
+
padding: 0 !important;
|
| 49 |
+
font-family: 'Inter', sans-serif;
|
| 50 |
+
background: #030712 !important;
|
| 51 |
+
min-height: 100vh;
|
| 52 |
+
}
|
| 53 |
+
footer { display: none !important; }
|
| 54 |
+
|
| 55 |
+
/* Custom scrollbars */
|
| 56 |
+
::-webkit-scrollbar { width: 5px; height: 5px; }
|
| 57 |
+
::-webkit-scrollbar-track { background: #0f172a; }
|
| 58 |
+
::-webkit-scrollbar-thumb { background: #1e293b; border-radius: 3px; }
|
| 59 |
+
::-webkit-scrollbar-thumb:hover { background: #10b981; }
|
| 60 |
+
|
| 61 |
+
/* ── Header ── */
|
| 62 |
+
#odin-header {
|
| 63 |
+
background: #020617 !important;
|
| 64 |
+
border-bottom: 1px solid #0d2a1f !important;
|
| 65 |
+
box-shadow: 0 1px 0 #10b98122, 0 4px 24px #00000066 !important;
|
| 66 |
+
padding: 0 20px !important;
|
| 67 |
+
height: 54px;
|
| 68 |
+
align-items: center !important;
|
| 69 |
+
flex-wrap: nowrap !important;
|
| 70 |
+
gap: 12px !important;
|
| 71 |
+
}
|
| 72 |
+
.odin-logo-wrap {
|
| 73 |
+
display: flex; align-items: center; gap: 10px; text-decoration: none;
|
| 74 |
+
}
|
| 75 |
+
.odin-rune {
|
| 76 |
+
font-family: 'Share Tech Mono', monospace;
|
| 77 |
+
font-size: 1.6em; font-weight: 900;
|
| 78 |
+
color: #10b981;
|
| 79 |
+
text-shadow: 0 0 12px #10b98166, 0 0 24px #10b98133;
|
| 80 |
+
letter-spacing: 4px;
|
| 81 |
+
line-height: 1;
|
| 82 |
+
}
|
| 83 |
+
.odin-divider {
|
| 84 |
+
width: 1px; height: 26px; background: #1e293b; flex-shrink: 0;
|
| 85 |
+
}
|
| 86 |
+
.odin-wordmark {
|
| 87 |
+
font-size: 0.68em; color: #475569; line-height: 1.3;
|
| 88 |
+
font-family: 'Share Tech Mono', monospace; letter-spacing: 0.5px;
|
| 89 |
+
}
|
| 90 |
+
.odin-wordmark strong { color: #94a3b8; font-weight: 600; }
|
| 91 |
+
.odin-stats {
|
| 92 |
+
margin-left: auto;
|
| 93 |
+
display: flex; gap: 16px; align-items: center;
|
| 94 |
+
}
|
| 95 |
+
.odin-stat {
|
| 96 |
+
font-family: 'Share Tech Mono', monospace;
|
| 97 |
+
font-size: 0.66em; color: #334155; line-height: 1.3; text-align: center;
|
| 98 |
+
}
|
| 99 |
+
.odin-stat span { display: block; color: #10b981; font-weight: 700; font-size: 1.15em; }
|
| 100 |
+
|
| 101 |
+
/* ── Chat column ── */
|
| 102 |
+
#chat-col {
|
| 103 |
+
background: #030712 !important;
|
| 104 |
+
border-right: 1px solid #0f172a !important;
|
| 105 |
+
}
|
| 106 |
+
.chatbot-wrap {
|
| 107 |
+
background: #030712 !important;
|
| 108 |
+
border: none !important;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
/* User bubbles */
|
| 112 |
+
.message.user {
|
| 113 |
+
background: linear-gradient(135deg, #0f2a1e 0%, #0d2234 100%) !important;
|
| 114 |
+
color: #e2e8f0 !important;
|
| 115 |
+
border: 1px solid #1a3a2a !important;
|
| 116 |
+
border-radius: 10px 10px 2px 10px !important;
|
| 117 |
+
}
|
| 118 |
+
/* Bot bubbles */
|
| 119 |
+
.message.bot {
|
| 120 |
+
background: #0a0f1e !important;
|
| 121 |
+
color: #cbd5e1 !important;
|
| 122 |
+
border: 1px solid #0f172a !important;
|
| 123 |
+
border-left: 2px solid #10b98133 !important;
|
| 124 |
+
border-radius: 2px 10px 10px 10px !important;
|
| 125 |
+
}
|
| 126 |
+
/* Code blocks in responses */
|
| 127 |
+
.message.bot code { background: #0f172a !important; color: #6ee7b7 !important; font-family: 'Share Tech Mono', monospace !important; font-size: 0.88em !important; }
|
| 128 |
+
.message.bot pre { background: #0a0f1e !important; border: 1px solid #1e293b !important; border-left: 3px solid #10b981 !important; }
|
| 129 |
+
/* Tables */
|
| 130 |
+
.message.bot table { font-size: 0.83em !important; border-collapse: collapse !important; }
|
| 131 |
+
.message.bot th { background: #0f172a !important; color: #10b981 !important; border: 1px solid #1e293b !important; padding: 4px 8px !important; font-family: 'Share Tech Mono', monospace; }
|
| 132 |
+
.message.bot td { border: 1px solid #1e293b !important; padding: 3px 8px !important; color: #94a3b8 !important; }
|
| 133 |
+
.message.bot tr:nth-child(even) td { background: #0a0f1e !important; }
|
| 134 |
+
|
| 135 |
+
/* ── Input zone ── */
|
| 136 |
+
#input-zone {
|
| 137 |
+
padding: 10px 16px 12px !important;
|
| 138 |
+
background: #030712 !important;
|
| 139 |
+
border-top: 1px solid #0f172a !important;
|
| 140 |
+
align-items: flex-end !important;
|
| 141 |
+
gap: 8px !important;
|
| 142 |
+
}
|
| 143 |
+
#msg-input textarea {
|
| 144 |
+
background: #0a0f1e !important;
|
| 145 |
+
color: #e2e8f0 !important;
|
| 146 |
+
border: 1px solid #1e293b !important;
|
| 147 |
+
border-radius: 8px !important;
|
| 148 |
+
font-size: 0.9em !important;
|
| 149 |
+
font-family: 'Inter', sans-serif !important;
|
| 150 |
+
resize: none !important;
|
| 151 |
+
}
|
| 152 |
+
#msg-input textarea:focus {
|
| 153 |
+
border-color: #10b981 !important;
|
| 154 |
+
box-shadow: 0 0 0 2px #10b98122 !important;
|
| 155 |
+
}
|
| 156 |
+
#msg-input textarea::placeholder { color: #334155 !important; }
|
| 157 |
+
#send-btn {
|
| 158 |
+
background: linear-gradient(135deg, #059669 0%, #047857 100%) !important;
|
| 159 |
+
border: 1px solid #065f46 !important;
|
| 160 |
+
font-weight: 700 !important;
|
| 161 |
+
font-family: 'Share Tech Mono', monospace !important;
|
| 162 |
+
letter-spacing: 1px !important;
|
| 163 |
+
box-shadow: 0 2px 8px #10b98133 !important;
|
| 164 |
+
transition: all 0.2s !important;
|
| 165 |
+
}
|
| 166 |
+
#send-btn:hover {
|
| 167 |
+
background: linear-gradient(135deg, #10b981 0%, #059669 100%) !important;
|
| 168 |
+
box-shadow: 0 4px 16px #10b98144 !important;
|
| 169 |
+
transform: translateY(-1px) !important;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
/* ── Meta chips bar ── */
|
| 173 |
+
#meta-bar { padding: 5px 16px 2px; background: #030712; min-height: 28px; }
|
| 174 |
+
|
| 175 |
+
/* ── Chart area ── */
|
| 176 |
+
#chart-area { padding: 0 4px; }
|
| 177 |
+
/* export-file is always in the DOM (hidden via size, not display:none)
|
| 178 |
+
so JS getElementById works even before the user clicks Export */
|
| 179 |
+
#export-file { height: 0 !important; overflow: hidden !important;
|
| 180 |
+
padding: 0 !important; margin: 0 !important; }
|
| 181 |
+
|
| 182 |
+
/* ── Right panel ── */
|
| 183 |
+
#right-panel {
|
| 184 |
+
background: #020617 !important;
|
| 185 |
+
border-left: 1px solid #0f172a !important;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
/* ── Tabs ── */
|
| 189 |
+
.tabs { background: transparent !important; }
|
| 190 |
+
.tab-nav {
|
| 191 |
+
background: #020617 !important;
|
| 192 |
+
border-bottom: 1px solid #0f172a !important;
|
| 193 |
+
padding: 0 10px !important;
|
| 194 |
+
}
|
| 195 |
+
.tab-nav button {
|
| 196 |
+
color: #334155 !important;
|
| 197 |
+
font-size: 0.75em !important;
|
| 198 |
+
font-family: 'Share Tech Mono', monospace !important;
|
| 199 |
+
letter-spacing: 0.5px !important;
|
| 200 |
+
padding: 10px 10px !important;
|
| 201 |
+
border-bottom: 2px solid transparent !important;
|
| 202 |
+
transition: all 0.2s !important;
|
| 203 |
+
}
|
| 204 |
+
.tab-nav button:hover { color: #64748b !important; }
|
| 205 |
+
.tab-nav button.selected { color: #10b981 !important; border-bottom-color: #10b981 !important; }
|
| 206 |
+
|
| 207 |
+
/* ── Suggested prompts ── */
|
| 208 |
+
.prompts-scroll { max-height: calc(100vh - 130px); overflow-y: auto; padding: 12px 14px; }
|
| 209 |
+
.prompt-hint {
|
| 210 |
+
font-size: 0.68em; color: #1e3a2a;
|
| 211 |
+
padding: 6px 10px 12px; line-height: 1.6;
|
| 212 |
+
font-family: 'Share Tech Mono', monospace;
|
| 213 |
+
border-left: 2px solid #10b98133; margin-bottom: 8px;
|
| 214 |
+
}
|
| 215 |
+
.p-btn {
|
| 216 |
+
display: block !important; width: 100% !important; text-align: left !important;
|
| 217 |
+
padding: 10px 12px !important; margin: 6px 0 !important;
|
| 218 |
+
background: #0a0f1e !important;
|
| 219 |
+
border: 1px solid #1e293b !important;
|
| 220 |
+
border-left: 3px solid #1e3a2a !important;
|
| 221 |
+
border-radius: 6px !important; cursor: pointer !important;
|
| 222 |
+
color: #64748b !important; font-size: 0.77em !important; line-height: 1.55 !important;
|
| 223 |
+
white-space: normal !important; height: auto !important;
|
| 224 |
+
transition: all 0.2s !important;
|
| 225 |
+
font-family: 'Inter', sans-serif !important;
|
| 226 |
+
}
|
| 227 |
+
.p-btn:hover {
|
| 228 |
+
background: #0d1f18 !important;
|
| 229 |
+
border-color: #1e3a2a !important;
|
| 230 |
+
border-left-color: #10b981 !important;
|
| 231 |
+
color: #a7f3d0 !important;
|
| 232 |
+
transform: translateX(3px) !important;
|
| 233 |
+
box-shadow: -3px 0 12px #10b98122 !important;
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
/* ── Pipeline HUD tab ── */
|
| 237 |
+
.hud-scroll { overflow-y: auto; padding: 10px 12px; display:flex; flex-direction:column; gap:10px; }
|
| 238 |
+
.pipe-title {
|
| 239 |
+
color: #10b981; font-weight: 700; text-transform: uppercase;
|
| 240 |
+
letter-spacing: 2px; font-size: 0.65em; margin-bottom: 10px;
|
| 241 |
+
font-family: 'Share Tech Mono', monospace;
|
| 242 |
+
display: flex; align-items: center; gap: 6px;
|
| 243 |
+
}
|
| 244 |
+
.pipe-title::after {
|
| 245 |
+
content: ''; flex: 1; height: 1px; background: linear-gradient(to right, #1e293b, transparent);
|
| 246 |
+
}
|
| 247 |
+
.pipe-track { border-left: 2px solid #0f172a; margin-left: 8px; padding-left: 14px; }
|
| 248 |
+
.pipe-step {
|
| 249 |
+
position: relative; display: flex; align-items: center; gap: 8px;
|
| 250 |
+
padding: 6px 8px; margin-bottom: 6px;
|
| 251 |
+
border-radius: 6px; background: #0a0f1e; border: 1px solid #0f172a;
|
| 252 |
+
transition: all 0.3s ease; opacity: 0.25; filter: grayscale(1); font-size: 0.79em;
|
| 253 |
+
}
|
| 254 |
+
.pipe-step.active { opacity:1; filter:none; background:#051a11; border-color:#10b981; animation:pipeGlow 2s infinite; }
|
| 255 |
+
.pipe-step.complete { opacity:0.8; filter:none; background:#0a0f1e; border-color:#1e3a5f; }
|
| 256 |
+
.pipe-step.delegating{ opacity:1; filter:none; background:#150d2a; border-color:#8b5cf6; animation:pipeDel 1.5s ease infinite; }
|
| 257 |
+
.pipe-dot { width:7px; height:7px; border-radius:50%; background:#1e293b; flex-shrink:0; position:absolute; left:-18px; top:11px; }
|
| 258 |
+
.pipe-step.active .pipe-dot { background:#10b981; box-shadow:0 0 6px #10b981; }
|
| 259 |
+
.pipe-step.complete .pipe-dot { background:#3b82f6; }
|
| 260 |
+
.pipe-step.delegating .pipe-dot{ background:#8b5cf6; }
|
| 261 |
+
.pipe-icon { font-size:0.95em; flex-shrink:0; }
|
| 262 |
+
.pipe-name { font-weight:600; color:#64748b; white-space:nowrap; font-size:0.95em; }
|
| 263 |
+
.pipe-sub { font-size:0.82em; color:#334155; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; max-width:140px; }
|
| 264 |
+
.pipe-step.active .pipe-name { color:#a7f3d0; }
|
| 265 |
+
.pipe-step.active .pipe-sub { color:#6ee7b7; }
|
| 266 |
+
.pipe-step.complete .pipe-name { color:#7dd3fc; }
|
| 267 |
+
.pipe-step.complete .pipe-sub { color:#334155; }
|
| 268 |
+
.pipe-step.delegating .pipe-name{ color:#c4b5fd; }
|
| 269 |
+
/* KB mini-nodes */
|
| 270 |
+
.pipe-kb-row { display:flex; gap:5px; margin-bottom:8px; }
|
| 271 |
+
.pipe-kb-node { flex:1; display:flex; align-items:center; gap:5px; padding:5px 7px; border-radius:6px; font-size:0.74em; background:#0a0f1e; border:1px solid #0f172a; opacity:0.25; filter:grayscale(1); transition:all 0.3s; }
|
| 272 |
+
.pipe-kb-node.active { opacity:1; filter:none; background:#051a11; border-color:#10b981; animation:pipeGlow 2s infinite; }
|
| 273 |
+
.pipe-kb-node.complete { opacity:0.8; filter:none; background:#0a0f1e; border-color:#1e3a5f; }
|
| 274 |
+
.pipe-kb-name { font-weight:600; color:#475569; display:block; font-size:0.9em; }
|
| 275 |
+
.pipe-kb-sub { color:#334155; display:block; font-size:0.82em; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; max-width:90px; }
|
| 276 |
+
.pipe-kb-node.active .pipe-kb-name { color:#a7f3d0; }
|
| 277 |
+
.pipe-kb-node.active .pipe-kb-sub { color:#6ee7b7; }
|
| 278 |
+
.pipe-kb-node.complete .pipe-kb-name{ color:#7dd3fc; }
|
| 279 |
+
/* Telemetry */
|
| 280 |
+
.pipe-telemetry { padding:8px 10px; border-radius:6px; background:#04080f; border:1px solid #0f172a; font-size:0.75em; }
|
| 281 |
+
.telem-title { color:#1e293b; text-transform:uppercase; letter-spacing:1.5px; font-size:0.78em; margin-bottom:5px; font-family:'Share Tech Mono',monospace; }
|
| 282 |
+
.telem-chip { display:inline-block; padding:2px 7px; border-radius:4px; margin:2px 2px 2px 0; font-size:0.82em; font-weight:700; font-family:'Share Tech Mono',monospace; }
|
| 283 |
+
.telem-footer{ color:#1e293b; margin-top:5px; padding-top:5px; border-top:1px solid #0f172a; font-family:'Share Tech Mono',monospace; font-size:0.9em; }
|
| 284 |
+
/* Live Feed */
|
| 285 |
+
.feed-wrap { border-radius:7px; background:#04080f; border:1px solid #0f172a; overflow:hidden; }
|
| 286 |
+
.feed-header{ padding:5px 10px; background:#020617; border-bottom:1px solid #0f172a; font-size:0.65em; font-weight:700; color:#10b981; text-transform:uppercase; letter-spacing:2px; font-family:'Share Tech Mono',monospace; }
|
| 287 |
+
.feed-body { max-height:240px; overflow-y:auto; padding:4px 0; }
|
| 288 |
+
.feed-entry { display:flex; align-items:flex-start; gap:6px; padding:4px 10px; border-bottom:1px solid #04080f; font-size:0.75em; }
|
| 289 |
+
.feed-entry:last-child { border-bottom:none; }
|
| 290 |
+
.feed-entry.thought { background:#0a0f1e33; }
|
| 291 |
+
.feed-entry.tool { background:#051a1133; }
|
| 292 |
+
.feed-entry.handoff { background:#0c1a3333; border-left:2px solid #1e3a5f; }
|
| 293 |
+
.feed-entry.system { opacity:0.45; }
|
| 294 |
+
.feed-badge { flex-shrink:0; padding:1px 5px; border-radius:3px; font-size:0.77em; font-weight:700; white-space:nowrap; font-family:'Share Tech Mono',monospace; letter-spacing:0.3px; }
|
| 295 |
+
.feed-badge.analyst { background:#051a11; color:#6ee7b7; border:1px solid #064e3b; }
|
| 296 |
+
.feed-badge.historian { background:#1c0a04; color:#fed7aa; border:1px solid #7c2d12; }
|
| 297 |
+
.feed-badge.auditor { background:#060d1e; color:#bfdbfe; border:1px solid #1e3a8a; }
|
| 298 |
+
.feed-badge.engineer { background:#0f0a1e; color:#ddd6fe; border:1px solid #4c1d95; }
|
| 299 |
+
.feed-badge.system { background:#080c12; color:#475569; border:1px solid #1e293b; }
|
| 300 |
+
.feed-badge.tool-badge{ background:#04080f; color:#64748b; border:1px solid #0f172a; }
|
| 301 |
+
.feed-text { color:#334155; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; flex:1; }
|
| 302 |
+
.feed-text b{ color:#64748b; font-weight:600; }
|
| 303 |
+
|
| 304 |
+
/* ── Animations ── */
|
| 305 |
+
@keyframes pipeGlow {
|
| 306 |
+
0% { box-shadow: 0 0 0 0 rgba(16,185,129,.35); }
|
| 307 |
+
70% { box-shadow: 0 0 0 5px rgba(16,185,129,0); }
|
| 308 |
+
100% { box-shadow: 0 0 0 0 rgba(16,185,129,0); }
|
| 309 |
+
}
|
| 310 |
+
@keyframes pipeDel {
|
| 311 |
+
0%,100% { box-shadow: 0 0 0 0 rgba(139,92,246,.35); }
|
| 312 |
+
50% { box-shadow: 0 0 8px 2px rgba(139,92,246,.25); }
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
/* ── Responsive ─�� */
|
| 316 |
+
@media (max-width: 860px) {
|
| 317 |
+
#right-panel { border-left: none !important; border-top: 1px solid #0f172a !important; }
|
| 318 |
+
.odin-stats { display: none; }
|
| 319 |
+
}
|
| 320 |
+
"""
|
| 321 |
+
|
| 322 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 323 |
+
# RENDER HELPERS
|
| 324 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 325 |
+
|
| 326 |
+
# Default HUD (pipeline side-panel) state.  render_hud() merges this UNDER the
# caller's dict, so every key the HTML template reads is guaranteed present.
# NOTE(review): "tools_used" and "live_feed" are mutable values that every
# `{**_EMPTY_HUD_STATE, ...}` spread shares by reference — chat_response
# overwrites both with fresh containers, but confirm before mutating in place.
_EMPTY_HUD_STATE = {
    "q_status": "", "q_detail": "",
    "iadc_status": "", "iadc_detail": "IADC Glossary · 2,400 chunks",
    "volve_status": "", "volve_detail": "Volve DDR/EDM · 23K chunks",
    "analyst_status": "", "analyst_detail": "Waiting",
    "historian_status": "", "historian_detail": "Waiting",
    "auditor_status": "", "auditor_detail": "Waiting",
    "engineer_status": "", "engineer_detail": "Waiting",
    "s_status": "", "s_detail": "Queued",
    "tools_used": set(), "action_count": 0, "elapsed": 0.0,
    "live_feed": [],  # list of {icon, badge_class, badge, type, text}
}
|
| 338 |
+
|
| 339 |
+
# Tool label → (background, foreground) colour pair.  Used for the telemetry
# chips in render_hud() and the footer chips in render_metadata(); iteration
# order also fixes chip display order.
_TOOL_COLORS = {
    "DDR": ("#1e3a8a", "#bfdbfe"),
    "WITSML": ("#064e3b", "#a7f3d0"),
    "EDM": ("#7c2d12", "#fed7aa"),
    "IADC": ("#4c1d95", "#ddd6fe"),
    "Volve DB": ("#0c4a6e", "#bae6fd"),
    "Python REPL": ("#1f2937", "#d1d5db"),
}
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
# Agent display name → (CSS badge class, emoji icon) for live-feed entries.
# Unknown names fall back to ("system", "•") at the lookup site.
_AGENT_BADGE = {
    "Drilling Data Analyst": ("analyst", "📊"),
    "Volve Campaign Historian": ("historian", "📜"),
    "Rig Operations Auditor": ("auditor", "📋"),
    "Lead Drilling Engineer": ("engineer", "👷"),
    "Rate Limiter": ("system", "⏳"),
    "Router": ("system", "🔀"),
}
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def render_hud(state: dict) -> str:
    """Render the right-hand pipeline HUD as a single HTML string.

    *state* is merged over _EMPTY_HUD_STATE so missing keys fall back to
    their defaults.  The output has four sections: the pipeline step list,
    the dual knowledge-base node row, a compact telemetry card, and the
    live agent feed.
    """
    state = {**_EMPTY_HUD_STATE, **state}

    def _step(s_key, icon, label, d_key):
        # One pipeline-step card.  s_key selects the CSS status class
        # ("", "active", "delegating" or "complete"); the detail caption is
        # clamped to 36 chars so the card never wraps.
        st = state.get(s_key, "")
        det = (state.get(d_key, "") or "")[:36]
        return f"""<div style="position:relative">
<div class="pipe-dot"></div>
<div class="pipe-step {st}">
<span class="pipe-icon">{icon}</span>
<div style="min-width:0;overflow:hidden">
<span class="pipe-name">{label}</span>
<span class="pipe-sub">{det}</span>
</div>
</div>
</div>"""

    # KB dual-node row
    iadc_sub = (state['iadc_detail'] or "IADC Glossary · 2,400 chunks")[:22]
    volve_sub = (state['volve_detail'] or "Volve DDR/EDM · 23K chunks")[:22]
    kb_row = f"""<div class="pipe-kb-row">
<div class="pipe-kb-node {state['iadc_status']}">
<span>📚</span>
<div><span class="pipe-kb-name">IADC DB</span><span class="pipe-kb-sub">{iadc_sub}</span></div>
</div>
<div class="pipe-kb-node {state['volve_status']}">
<span>🗂️</span>
<div><span class="pipe-kb-name">Volve DB</span><span class="pipe-kb-sub">{volve_sub}</span></div>
</div>
</div>"""

    # Compact telemetry — one coloured chip per tool seen so far.
    tools = state.get("tools_used", set())
    chips = "".join(
        f'<span class="telem-chip" style="background:{bg};color:{fg}">{t}</span>'
        for t, (bg, fg) in _TOOL_COLORS.items() if t in tools
    ) or '<span style="color:#334155">No tools yet</span>'
    elapsed = state.get("elapsed", 0.0)
    telemetry = f"""<div class="pipe-telemetry">
<div class="telem-title">Tools Used</div>
<div>{chips}</div>
<div class="telem-footer">⏱ {f"{elapsed:.0f}s" if elapsed else "--"} | 🔧 {state.get("action_count", 0)} actions</div>
</div>"""

    # Live Feed — flat entries
    feed_entries = ""
    for entry in state.get("live_feed", []):
        bclass = entry.get("badge_class", "system")
        badge = entry.get("badge", "SYS")
        text = entry.get("text", "")[:90]
        etype = entry.get("type", "system")
        feed_entries += (
            f'<div class="feed-entry {etype}">'
            f'<span class="feed-badge {bclass}">{badge}</span>'
            f'<span class="feed-text">{text}</span>'
            f'</div>'
        )
    if not feed_entries:
        feed_entries = '<div style="padding:12px 10px;color:#334155;font-size:0.75em">Waiting for agent activity…</div>'

    live_feed = f"""<div class="feed-wrap">
<div class="feed-header">// LIVE AGENT FEED</div>
<div class="feed-body">{feed_entries}</div>
</div>"""

    return f"""<div class="hud-scroll">
<div>
<div class="pipe-title">▶ PIPELINE</div>
{_step("q_status", "❓", "Query", "q_detail")}
<div class="pipe-track">
{kb_row}
{_step("analyst_status", "📊", "Data Analyst", "analyst_detail")}
{_step("historian_status", "📜", "Historian", "historian_detail")}
{_step("auditor_status", "📋", "Auditor", "auditor_detail")}
{_step("engineer_status", "👷", "Odin", "engineer_detail")}
{_step("s_status", "✅", "Synthesis", "s_detail")}
</div>
{telemetry}
</div>
{live_feed}
</div>"""
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def extract_confidence_with_reason(text: str) -> tuple:
    """Returns (level: str|None, reason: str).

    Scans *text* case-insensitively for a stated confidence level
    ("high"/"medium"/"low") and returns it upper-cased, together with a
    snippet of the original text around the match (80 chars before,
    250 after) as the reasoning, flattened to a single line.
    """
    lowered = text.lower()
    patterns = (
        r'confidence[:\s*]+\**\s*(high|medium|low)\**',
        r'\**(high|medium|low)\*\*\s+confidence',
        r'(high|medium|low)\s+confidence',
    )
    for pattern in patterns:
        match = re.search(pattern, lowered)
        if match is None:
            continue
        # Window of the ORIGINAL text around the match — same indices are
        # valid because lower() preserves length for this ASCII pattern set.
        lo = max(0, match.start() - 80)
        hi = min(len(text), match.end() + 250)
        snippet = text[lo:hi].strip().replace("\n", " ")
        return match.group(1).upper(), snippet
    return None, ""
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
# Keep backward-compatible alias
|
| 461 |
+
def extract_confidence(text: str) -> str | None:
    """Backward-compatible alias: return only the confidence level."""
    return extract_confidence_with_reason(text)[0]
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
# Default tooltip text per confidence level — used by render_metadata() when
# the model gave no explicit reasoning for its confidence call.
_CONF_EXPLAIN = {
    "HIGH": "Multiple independent data sources agree (DDR + WITSML ± EDM). No contradictions detected.",
    "MEDIUM": "Primary data source used. Minor ambiguities or single-source validation.",
    "LOW": "Limited data coverage, significant assumptions required, or conflicting signals.",
}
|
| 471 |
+
|
| 472 |
+
def render_metadata(tools: set, confidence: str | None, elapsed: float,
|
| 473 |
+
confidence_reason: str = "") -> str:
|
| 474 |
+
"""Compact one-line footer HTML to embed directly inside a bot chat message."""
|
| 475 |
+
if not tools and not confidence:
|
| 476 |
+
return ""
|
| 477 |
+
_conf_col = {"HIGH": ("#064e3b", "#6ee7b7"), "MEDIUM": ("#78350f", "#fde68a"), "LOW": ("#7f1d1d", "#fca5a5")}
|
| 478 |
+
_tool_labels = {"DDR": "DDR", "WITSML": "WITSML", "EDM": "EDM",
|
| 479 |
+
"IADC": "IADC", "Volve DB": "Volve", "Python REPL": "Python"}
|
| 480 |
+
parts = []
|
| 481 |
+
for t, (bg, fg) in _TOOL_COLORS.items():
|
| 482 |
+
if t in tools and t in _tool_labels:
|
| 483 |
+
parts.append(
|
| 484 |
+
f'<span style="background:{bg};color:{fg};padding:1px 6px;border-radius:3px;'
|
| 485 |
+
f'font-size:0.7em;font-weight:700;font-family:\'Share Tech Mono\',monospace">'
|
| 486 |
+
f'{_tool_labels[t]}</span>'
|
| 487 |
+
)
|
| 488 |
+
if confidence:
|
| 489 |
+
bg, fg = _conf_col.get(confidence, ("#1f2937", "#d1d5db"))
|
| 490 |
+
tip = (confidence_reason[:200] + "…") if confidence_reason else _CONF_EXPLAIN.get(confidence, "")
|
| 491 |
+
parts.append(
|
| 492 |
+
f'<span style="background:{bg};color:{fg};padding:1px 7px;border-radius:3px;'
|
| 493 |
+
f'font-size:0.7em;font-weight:700;cursor:default;font-family:\'Share Tech Mono\',monospace"'
|
| 494 |
+
f' title="{tip}">● {confidence}</span>'
|
| 495 |
+
)
|
| 496 |
+
if elapsed > 0:
|
| 497 |
+
parts.append(f'<span style="color:#1e3a2a;font-size:0.68em;font-family:\'Share Tech Mono\',monospace">⏱ {elapsed:.0f}s</span>')
|
| 498 |
+
inner = ' '.join(parts)
|
| 499 |
+
return (
|
| 500 |
+
f'<div style="margin-top:10px;padding-top:7px;border-top:1px solid #0d1a24;'
|
| 501 |
+
f'display:flex;gap:5px;align-items:center;flex-wrap:wrap">{inner}</div>'
|
| 502 |
+
)
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
def _chart_embed(p: str) -> str:
|
| 506 |
+
"""Return an embed snippet for a chart file — no file-serving required."""
|
| 507 |
+
import base64 as _b64
|
| 508 |
+
path = Path(p)
|
| 509 |
+
if not path.exists():
|
| 510 |
+
return f'<div style="color:#ef4444;padding:8px;font-size:0.8em">Missing: {path.name}</div>'
|
| 511 |
+
wrap = 'style="border-radius:8px;border:1px solid #1e293b;overflow:hidden;margin-bottom:14px"'
|
| 512 |
+
if p.endswith(".png"):
|
| 513 |
+
data = _b64.b64encode(path.read_bytes()).decode()
|
| 514 |
+
return f'<div {wrap}><img src="data:image/png;base64,{data}" style="width:100%;display:block"/></div>'
|
| 515 |
+
# HTML chart — base64 data URI avoids all srcdoc escaping issues
|
| 516 |
+
b64_html = _b64.b64encode(path.read_bytes()).decode()
|
| 517 |
+
return (f'<div {wrap}><iframe src="data:text/html;base64,{b64_html}" width="100%" height="480" '
|
| 518 |
+
f'frameborder="0" style="display:block" sandbox="allow-scripts"></iframe></div>')
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
def render_charts(chart_paths: list) -> str:
    """Render the chart panel: labelled embeds, or an empty-state card."""
    if not chart_paths:
        return """<div class="charts-scroll">
<div class="chart-empty">
<div style="font-size:2.5em">📊</div>
<div style="color:#475569;font-weight:600">No charts yet</div>
<div style="color:#334155;max-width:200px">
Ask about ROP, NPT, Days vs Depth, or well comparisons to trigger visualizations.
</div>
</div>
</div>"""
    # Prefer PNG over HTML for the same chart stem
    png_stems = {Path(cp).stem for cp in chart_paths if cp.endswith(".png") and Path(cp).exists()}
    rendered = []
    for chart_path in chart_paths:
        stem = Path(chart_path).stem
        if chart_path.endswith(".html") and stem in png_stems:
            continue  # PNG version covers this chart
        if not Path(chart_path).exists():
            continue
        title = stem.replace("_", " ").title()
        caption = (f'<div style="color:#475569;font-size:0.7em;text-transform:uppercase;'
                   f'letter-spacing:1px;margin-bottom:4px">{title}</div>')
        rendered.append(caption + _chart_embed(chart_path))
    if not rendered:
        return render_charts([])  # all paths missing → empty state
    return f'<div class="charts-scroll">{"".join(rendered)}</div>'
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 553 |
+
# PIPELINE GENERATOR
|
| 554 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 555 |
+
|
| 556 |
+
# (substring keyword, display label) pairs matched against pipeline event
# status/detail text to track which tools were used.  Order matters: more
# specific keywords come before their shorter fallbacks.
_TOOL_KEYWORDS = [
    ("DDR_Query", "DDR"), ("DDR", "DDR"),
    ("WITSML_Analyst", "WITSML"), ("WITSML", "WITSML"),
    ("EDM_Technical", "EDM"), ("EDM", "EDM"),
    ("IADC_SearchTool", "IADC"), ("IADC", "IADC"),
    ("VolveHistory_SearchTool", "Volve DB"), ("VolveHistory", "Volve DB"), ("Volve", "Volve DB"),
    ("python_interpreter", "Python REPL"), ("Python REPL", "Python REPL"),
]
|
| 564 |
+
|
| 565 |
+
# Agent display name → HUD key prefix ("<prefix>_status"/"<prefix>_detail").
# "Rig Crew" events are attributed to the analyst lane.
_AGENT_MAP = {
    "Drilling Data Analyst": "analyst",
    "Volve Campaign Historian": "historian",
    "Rig Operations Auditor": "auditor",
    "Lead Drilling Engineer": "engineer",
    "Rig Crew": "analyst",
}
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
def chat_response(message, history):
    """Gradio generator handler: stream HUD/chat updates while the crew runs.

    Yields 6-tuples matching the outputs wired up in build_app:
    (chatbot history, msg textbox, HUD html, chart-area html,
     export-payload state, export-anchor html).
    Consumes events from run_pipeline(); event["event"] is one of
    "log", "verbose_log", "final_answer", "error".
    """
    if not message.strip():
        # Blank input — reset the HUD only, leave everything else untouched.
        yield history, gr.update(), render_hud(_EMPTY_HUD_STATE), gr.update(), "", gr.update()
        return

    query = message.strip()
    t0 = time.time()
    hud = {**_EMPTY_HUD_STATE, "q_status": "active", "q_detail": query[:40],
           "iadc_status": "active", "iadc_detail": "Mandatory search…",
           "tools_used": set()}

    history = list(history) + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": "⟳ Initializing Odin…"},
    ]
    chart_paths = []
    # repo_root/outputs/figures — where the chart tools write their files.
    base_figures = Path(__file__).resolve().parents[2] / "outputs" / "figures"

    yield history, gr.update(value=""), render_hud(hud), gr.update(), "", gr.update(value="", visible=True)

    # Collapsible "Thinking Process" block, built up as raw HTML in the bot bubble.
    logs = ("<details open><summary style='cursor:pointer;color:#64748b;font-size:0.82em;"
            "user-select:none;padding:4px 0'>⚙️ Thinking Process</summary>"
            "<ul style='list-style:none;padding:2px 0 0;margin:0;font-family:monospace;font-size:0.79em;color:#475569'>")
    step_log = []  # High-level event log (always captured)
    verbose_log = ""  # Full CrewAI stdout transcript (set before final_answer by orchestrator)

    # Prior turns (excluding the two just appended) as plain-text context.
    context_text = [f"{m['role'].upper()}: {m['content']}" for m in history[:-2]]

    for event in run_pipeline(query, chat_history=context_text):
        hud["elapsed"] = time.time() - t0

        if event["event"] == "log":
            name = event.get("name", "")
            status = event.get("status", "")
            detail = event.get("detail", "")
            icon = event.get("icon", "•")
            is_dia = event.get("is_dialogue", False)
            ts = time.strftime("%H:%M:%S", time.localtime(event["time"]))

            # ── Parse chart paths from tool output in real-time ──
            for line in (detail + " " + status).split("\n"):
                if "chart saved to:" in line.lower() or "interactive chart saved to:" in line.lower():
                    for part in line.split():
                        if part.endswith((".html", ".png")) and "/" in part:
                            if part not in chart_paths:
                                chart_paths.append(part)

            # Tool tracking — keyword match against status/detail text.
            for kw, label in _TOOL_KEYWORDS:
                if kw in status or kw in detail:
                    hud["tools_used"].add(label)

            if "Action:" in status:
                hud["action_count"] = hud.get("action_count", 0) + 1

            # HUD state machine
            if name == "Classifier":
                hud["q_status"] = "complete"
            elif "IADC" in status or "IADC" in detail:
                hud["iadc_status"] = "active"
                hud["iadc_detail"] = "Searching definitions…"
            elif "Volve" in status or "VolveHistory" in status or "Volve" in detail:
                hud["volve_status"] = "active"
                hud["volve_detail"] = "Searching 23K chunks…"
            elif name == "Complete":
                # Crew finished — mark every stage done and start synthesis.
                for k in ["q", "iadc", "volve", "analyst", "historian", "auditor", "engineer"]:
                    hud[f"{k}_status"] = "complete"
                hud["s_status"] = "active"; hud["s_detail"] = "Synthesizing…"

            if name in _AGENT_MAP:
                pfx = _AGENT_MAP[name]
                if pfx == "analyst":
                    # First analyst activity implies the KB lookups finished.
                    if hud["iadc_status"] == "active": hud["iadc_status"] = "complete"
                    if hud["volve_status"] == "active": hud["volve_status"] = "complete"
                if "Handoff Complete" in status:
                    hud[f"{pfx}_status"] = "complete"; hud[f"{pfx}_detail"] = "Done ✓"
                else:
                    hud[f"{pfx}_status"] = "delegating" if is_dia else "active"
                    hud[f"{pfx}_detail"] = status[:36]

            # ── Live feed entry ──
            bclass, _ = _AGENT_BADGE.get(name, ("system", "•"))
            badge_short = {"Drilling Data Analyst": "ANALYST", "Volve Campaign Historian": "HIST",
                           "Rig Operations Auditor": "AUDIT", "Lead Drilling Engineer": "ODIN",
                           "Rate Limiter": "RATE", "Router": "ROUTE"}.get(name, name[:6].upper())
            if "Action:" in status:
                # Tool invocation: badge shows the tool name, text shows its input.
                tool_name = status.replace("Action:", "").strip()
                inp = detail.replace("Input:", "").strip()[:50]
                feed_text = f"<b>{tool_name}</b> ← {inp}" if inp else f"<b>{tool_name}</b>"
                feed_type = "tool"
                badge_short = tool_name[:12]
                bclass = "tool-badge"
            elif "Thought" in status:
                feed_text = detail[:85]
                feed_type = "thought"
            elif "Handoff" in status or is_dia:
                feed_text = detail[:85]
                feed_type = "handoff"
            elif name in ("Rate Limiter", "Router"):
                feed_text = status[:85]
                feed_type = "system"
            else:
                feed_text = None  # skip low-signal events

            if feed_text:
                full_text = detail if "Thought" in status else (detail or status)
                # Keep only the 12 most recent feed entries.
                hud["live_feed"] = (hud.get("live_feed", []) + [
                    {"badge_class": bclass, "badge": badge_short, "type": feed_type,
                     "text": feed_text[:80], "full_text": full_text}
                ])[-12:]

            # Collapsible log in chat — dialogue events get a highlighted card.
            if is_dia:
                logs += (f"<li style='margin:5px 0;padding:6px;background:#1e3a8a22;border-left:3px solid #3b82f6;"
                         f"border-radius:4px'>[{ts}] {icon} <b style='color:#93c5fd'>{name}</b>: "
                         f"<span style='color:#64748b'>{status}</span><br/>"
                         f"<span style='color:#475569;font-style:italic'>{detail[:120]}</span></li>")
            else:
                det = f" <i style='color:#334155'>{detail[:80]}</i>" if detail else ""
                logs += f"<li style='margin:2px 0'>[{ts}] {icon} <b style='color:#64748b'>{name}</b>: <span style='color:#475569'>{status}</span>{det}</li>"

            # Accumulate rich step log for MD export — use detail_full if available
            detail_full = event.get("detail_full", detail)
            step_log.append(
                f"[{ts}] **{icon} {name}** — {status}" +
                (f"\n\n```\n{detail_full}\n```" if detail_full else "")
            )

            history[-1]["content"] = logs + "</ul></details>"
            yield history, gr.update(), render_hud(hud), gr.update(), "", gr.update()

        elif event["event"] == "verbose_log":
            # Full CrewAI terminal transcript — forwarded by orchestrator before final_answer
            # Storing here so it's available when export_payload is built in final_answer handler
            verbose_log = event.get("content", "")

        elif event["event"] == "final_answer":
            elapsed = time.time() - t0
            hud["elapsed"] = elapsed
            hud["s_status"] = "complete"
            hud["s_detail"] = f"Done in {elapsed:.1f}s"

            # Collect charts: sweep figures dir for files created during THIS query (since t0)
            # Using t0 as cutoff prevents old charts from previous queries bleeding in
            if base_figures.exists():
                for ext in ["*.html", "*.png"]:
                    for p in sorted(base_figures.glob(ext), key=lambda x: x.stat().st_mtime, reverse=True):
                        if p.stat().st_mtime >= t0 - 5:  # 5s grace for slow saves
                            sp = str(p.absolute())
                            if sp not in chart_paths:
                                chart_paths.append(sp)

            answer = event.get("answer", "")
            confidence, conf_reason = extract_confidence_with_reason(answer)
            # Fallback: infer confidence from data sources used if LLM didn't state it
            if not confidence:
                data_tools = hud["tools_used"] & {"DDR", "WITSML", "EDM"}
                if len(data_tools) >= 3:
                    confidence, conf_reason = "HIGH", "DDR + WITSML + EDM all queried and correlated."
                elif len(data_tools) == 2:
                    confidence, conf_reason = "MEDIUM", f"Two sources used: {', '.join(sorted(data_tools))}."
                elif data_tools:
                    confidence, conf_reason = "MEDIUM", f"Single data source: {list(data_tools)[0]}."
                else:
                    confidence, conf_reason = "MEDIUM", "Knowledge base (IADC / Volve corpus) consulted."
            meta_html = render_metadata(hud["tools_used"], confidence, elapsed, conf_reason)

            # Embed charts inline in the chat message
            chart_md, chart_html_fb = _embed_charts_inline(chart_paths)

            closed_logs = logs.replace("<details open>", "<details>") + "</ul></details>"
            # Meta chips embedded directly at bottom of bot message — no separate bar
            history[-1]["content"] = closed_logs + "\n\n" + answer + chart_md + meta_html

            # Pack export state with full step_log (not HTML-stripped log)
            tools_list = sorted(hud["tools_used"])
            export_payload = {
                "answer": answer, "confidence": confidence or "",
                "confidence_reason": conf_reason,
                "tools": tools_list, "elapsed": elapsed,
                "step_log": step_log,  # high-level event log
                "verbose_log": verbose_log,  # full CrewAI stdout transcript
                "chart_paths": chart_paths,
            }

            # Pre-compute download link so the Export button fires instantly (no queue wait)
            _export_html_update = gr.update()
            try:
                import urllib.parse as _ul
                _ep = export_answer(export_payload)
                if _ep:
                    _enc = _ul.quote(open(_ep, encoding="utf-8").read(), safe="")
                    # Hidden anchor — Export MD button JS clicks it, no visible link shown
                    _export_html_update = gr.update(visible=True, value=(
                        f'<a id="odin-dl" href="data:text/markdown;charset=utf-8,{_enc}" download="odin_report.md"></a>'
                    ))
            except Exception:
                # Export is best-effort; the chat answer must still be delivered.
                pass

            yield history, gr.update(value=""), render_hud(hud), gr.update(value=chart_html_fb), export_payload, _export_html_update

        elif event["event"] == "error":
            elapsed = time.time() - t0
            err_msg = event.get("message", "Unknown error")
            hud["s_status"] = "complete"
            hud["s_detail"] = "Failed"
            # Still sweep for charts — some may have been generated before the failure
            if base_figures.exists():
                for ext in ["*.html", "*.png"]:
                    for p in sorted(base_figures.glob(ext), key=lambda x: x.stat().st_mtime, reverse=True):
                        if time.time() - p.stat().st_mtime < 600:
                            sp = str(p.absolute())
                            if sp not in chart_paths:
                                chart_paths.append(sp)
            closed_logs = logs.replace("<details open>", "<details>") + "</ul></details>"
            error_block = (
                f"\n\n> ⚠️ **Agent Error** — `{err_msg[:200]}`\n\n"
                "_The crew encountered an error. This is usually a Gemini rate limit (429) "
                "or max_iter exceeded — please wait 30–60 seconds and try again._"
            )
            # Any charts generated before the failure — show as HTML fallback
            _, chart_html_fb = _embed_charts_inline(chart_paths)
            history[-1]["content"] = closed_logs + error_block
            yield history, gr.update(value=""), render_hud(hud), gr.update(value=chart_html_fb), None, gr.update(value="", visible=True)
|
| 798 |
+
|
| 799 |
+
|
| 800 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 801 |
+
# UTILITY HANDLERS
|
| 802 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 803 |
+
|
| 804 |
+
def _strip_html(html: str) -> str:
|
| 805 |
+
"""Minimal HTML → plain text for .md export."""
|
| 806 |
+
import re as _re
|
| 807 |
+
text = _re.sub(r'<br\s*/?>', '\n', html)
|
| 808 |
+
text = _re.sub(r'<li[^>]*>', '• ', text)
|
| 809 |
+
text = _re.sub(r'<[^>]+>', '', text)
|
| 810 |
+
return text.strip()
|
| 811 |
+
|
| 812 |
+
|
| 813 |
+
def export_answer(payload):
    """Generate a rich .md report from the export payload dict.

    Accepts either the structured payload dict built by chat_response, or
    (legacy) a bare answer string.  Returns the path of a temp .md file,
    or None when there is nothing to export.
    """
    if not payload:
        return None
    if isinstance(payload, str):
        # Legacy fallback: just the answer string
        payload = {"answer": payload, "confidence": "", "tools": [], "elapsed": 0,
                   "confidence_reason": "", "log_html": "", "chart_paths": []}

    answer = payload.get("answer", "")
    if not answer.strip():
        return None

    confidence = payload.get("confidence", "")
    conf_reason = payload.get("confidence_reason", "")
    tools = payload.get("tools", [])
    elapsed = payload.get("elapsed", 0)
    step_log = payload.get("step_log", [])
    verbose_log = payload.get("verbose_log", "")
    chart_paths = payload.get("chart_paths", [])

    import datetime
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

    # Header + metadata chips.
    report = [
        "# ⚡ Odin — Drilling Intelligence Report",
        "_SPE GCS 2026 ML Challenge · Volve Field Dataset_",
        f"\n**Generated:** {stamp}",
    ]
    if elapsed:
        report.append(f"**Response time:** {elapsed:.0f}s")
    if tools:
        report.append(f"**Data sources:** {', '.join(tools)}")
    if confidence:
        report.append(f"**Confidence:** {confidence}")
    if conf_reason:
        report.append(f"> {conf_reason[:300]}")

    report.extend(["", "---", "", "## Analysis", "", answer])

    if chart_paths:
        report.extend(["", "---", "", "## Charts Generated", ""])
        report.extend(f"- `{p}`" for p in chart_paths)

    # Full agent transcript: prefer verbose_log (complete stdout) over step_log (event summaries)
    if verbose_log.strip():
        # Strip ANSI colour codes that CrewAI/Rich outputs
        import re as _re2
        clean = _re2.sub(r'\x1b\[[0-9;]*m', '', verbose_log)
        report.extend(["", "---", "", "## Full Agent Transcript", "", "```", clean.strip(), "```"])
    elif step_log:
        report.extend(["", "---", "", "## Agent Interaction Log", ""])
        report.extend(step_log)

    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".md", mode="w",
        encoding="utf-8", prefix="odin_report_"
    ) as out:
        out.write("\n".join(report))
    return out.name
|
| 875 |
+
|
| 876 |
+
|
| 877 |
+
def _embed_charts_inline(chart_paths: list):
|
| 878 |
+
"""
|
| 879 |
+
Embed all charts directly in the chat message as HTML.
|
| 880 |
+
Priority: interactive HTML srcdoc iframe > static PNG base64.
|
| 881 |
+
Returns (inline_html: str, "") — second value kept for API compat.
|
| 882 |
+
"""
|
| 883 |
+
import base64 as _b64
|
| 884 |
+
parts = []
|
| 885 |
+
stems_done = set()
|
| 886 |
+
|
| 887 |
+
def _chart_label(stem):
|
| 888 |
+
return stem.replace("_", " ").title()
|
| 889 |
+
|
| 890 |
+
def _wrap(name, inner):
|
| 891 |
+
return (
|
| 892 |
+
f'<div style="margin:18px 0 10px">'
|
| 893 |
+
f'<div style="color:#10b981;font-size:0.66em;font-family:\'Share Tech Mono\',monospace;'
|
| 894 |
+
f'text-transform:uppercase;letter-spacing:1.5px;margin-bottom:6px;'
|
| 895 |
+
f'display:flex;align-items:center;gap:6px">'
|
| 896 |
+
f'<span style="opacity:.5">▬</span> {name}</div>'
|
| 897 |
+
f'{inner}</div>'
|
| 898 |
+
)
|
| 899 |
+
|
| 900 |
+
# Build a stem → {html, png} map so we can pick HTML first
|
| 901 |
+
by_stem: dict = {}
|
| 902 |
+
for cp in chart_paths:
|
| 903 |
+
p = Path(cp)
|
| 904 |
+
if p.exists():
|
| 905 |
+
by_stem.setdefault(p.stem, {})[p.suffix] = p
|
| 906 |
+
|
| 907 |
+
for stem, files in by_stem.items():
|
| 908 |
+
if stem in stems_done:
|
| 909 |
+
continue
|
| 910 |
+
name = _chart_label(stem)
|
| 911 |
+
if ".html" in files:
|
| 912 |
+
stems_done.add(stem)
|
| 913 |
+
try:
|
| 914 |
+
# Use base64 data URI — avoids ALL newline/quote escaping issues with srcdoc
|
| 915 |
+
raw = files[".html"].read_bytes()
|
| 916 |
+
b64_html = _b64.b64encode(raw).decode()
|
| 917 |
+
inner = (
|
| 918 |
+
f'<div style="border-radius:6px;border:1px solid #1e293b;overflow:hidden">'
|
| 919 |
+
f'<iframe src="data:text/html;base64,{b64_html}" width="100%" height="480" '
|
| 920 |
+
f'frameborder="0" style="display:block;background:#030712" sandbox="allow-scripts"></iframe></div>'
|
| 921 |
+
)
|
| 922 |
+
parts.append(_wrap(name, inner))
|
| 923 |
+
except Exception:
|
| 924 |
+
pass
|
| 925 |
+
elif ".png" in files:
|
| 926 |
+
stems_done.add(stem)
|
| 927 |
+
try:
|
| 928 |
+
b64 = _b64.b64encode(files[".png"].read_bytes()).decode()
|
| 929 |
+
inner = (
|
| 930 |
+
f'<img src="data:image/png;base64,{b64}" '
|
| 931 |
+
f'style="width:100%;border-radius:6px;border:1px solid #1e293b;display:block"/>'
|
| 932 |
+
)
|
| 933 |
+
parts.append(_wrap(name, inner))
|
| 934 |
+
except Exception:
|
| 935 |
+
pass
|
| 936 |
+
|
| 937 |
+
return "".join(parts), "" # second value empty — all charts are now inline
|
| 938 |
+
|
| 939 |
+
|
| 940 |
+
def clear_session():
    """Reset the UI: empty chat, cleared input/HUD/charts, no export payload."""
    return (
        [],
        gr.update(value=""),
        render_hud(_EMPTY_HUD_STATE),
        gr.update(value=""),
        gr.update(value=""),
        None,
    )
|
| 944 |
+
|
| 945 |
+
|
| 946 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 947 |
+
# BUILD APP
|
| 948 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 949 |
+
|
| 950 |
+
def build_app():
    """Assemble and return the full Odin Gradio Blocks application.

    Layout: header row (branding + Clear/Export buttons), a 7/3 split of
    chat column vs. tabbed right panel (suggested prompts + agent HUD),
    and all event wiring (send/submit, prompt fill-ins, clear, JS export).
    """
    with gr.Blocks(title="Odin — Drilling Intelligence") as app:

        # Holds the export payload dict produced by the last completed query.
        report_state = gr.State(None)

        # ── Header ──────────────────────────────────────────────────────────
        with gr.Row(elem_id="odin-header"):
            gr.HTML(
                '<div class="odin-logo-wrap">'
                '<span class="odin-rune">ODIN</span>'
                '<div class="odin-divider"></div>'
                '<div class="odin-wordmark">'
                '<strong>Drilling Intelligence System</strong><br>'
                'SPE GCS 2026 · Volve Field'
                '</div>'
                '</div>'
                '<div class="odin-stats">'
                '<div class="odin-stat"><span>23</span>Wells</div>'
                '<div class="odin-stat"><span>32K+</span>DDR Records</div>'
                '<div class="odin-stat"><span>55K+</span>WITSML Rows</div>'
                '</div>'
            )
            clear_btn = gr.Button("Clear", size="sm", variant="secondary", min_width=70)
            export_btn = gr.Button("Export MD", size="sm", variant="primary", min_width=100)

        # ── Main Content ─────────────────────────────────────────────────────
        with gr.Row():
            # ── LEFT: Chat ───────────────────────────────────────────────────
            with gr.Column(scale=7, elem_id="chat-col"):
                chat_box = gr.Chatbot(
                    value=[],
                    show_label=False,
                    elem_classes=["chatbot-wrap"],
                    height=560,
                    render_markdown=True,
                    buttons=["copy"],
                    sanitize_html=False,
                )
                # HTML-only charts (those without a PNG twin) fall back here.
                inline_charts = gr.HTML(value="", elem_id="chart-area")
                export_slot = gr.HTML(value="", visible=True, elem_id="export-file")
                with gr.Row(elem_id="input-zone"):
                    query_box = gr.Textbox(
                        show_label=False,
                        placeholder="Ask about drilling phases, NPT, ROP, BHA performance, or well comparisons…",
                        scale=9, lines=1, max_lines=4, elem_id="msg-input",
                    )
                    submit_btn = gr.Button("Send ⚡", variant="primary", scale=1,
                                           min_width=90, elem_id="send-btn")

            # ── RIGHT: Tabs ───────────────────────────────────────────────────
            with gr.Column(scale=3, elem_id="right-panel"):
                with gr.Tabs():

                    # ── Tab 1: Suggested Prompts ──────────────────────────────
                    with gr.TabItem("💡 Prompts", id="tab-prompts"):
                        prompt_buttons = []
                        with gr.Column(elem_classes=["prompts-scroll"]):
                            gr.HTML('<div class="prompt-hint">// SELECT QUERY · PRESS SEND ⚡</div>')
                            for prompt_text in SUGGESTED_PROMPTS:
                                prompt_btn = gr.Button(
                                    value=prompt_text, size="sm",
                                    variant="secondary",
                                    elem_classes=["p-btn"],
                                )
                                prompt_buttons.append((prompt_btn, prompt_text))

                    # ── Tab 2: Agent HUD ──────────────────────────────────────
                    with gr.TabItem("🛰️ HUD", id="tab-hud"):
                        hud_panel = gr.HTML(value=render_hud(_EMPTY_HUD_STATE))

        # Output components in the exact order the chat_response generator yields.
        stream_outputs = [chat_box, query_box, hud_panel, inline_charts, report_state, export_slot]

        # ── Event Wiring ──────────────────────────────────────────────────────
        submit_btn.click(fn=chat_response, inputs=[query_box, chat_box], outputs=stream_outputs)
        query_box.submit(fn=chat_response, inputs=[query_box, chat_box], outputs=stream_outputs)

        # Each suggested-prompt button simply fills the textbox.
        # Default-arg binding captures the current prompt (late-binding pitfall).
        for prompt_btn, prompt_text in prompt_buttons:
            prompt_btn.click(fn=lambda chosen=prompt_text: chosen, inputs=[], outputs=[query_box])

        # Clear — resets chat, HUD, chart area, export link, and stored report.
        def _reset_session():
            return ([], gr.update(value=""), render_hud(_EMPTY_HUD_STATE),
                    gr.update(value=""),
                    gr.update(value="", visible=True), None)

        clear_btn.click(fn=_reset_session, inputs=[],
                        outputs=[chat_box, query_box, hud_panel, inline_charts, export_slot, report_state])

        # Export — pure-JS click on the pre-rendered download anchor.
        # No Python fn, no queue round-trip: fires instantly.
        export_btn.click(
            fn=None, inputs=[], outputs=[],
            js="() => { const a = document.getElementById('odin-dl'); if(a) a.click(); else alert('Run a query first to generate the report.'); }"
        )

    return app
|
| 1048 |
+
|
| 1049 |
+
|
| 1050 |
+
# ────────────────────────────────���────────────────────────────────────────────
|
| 1051 |
+
# ENTRY POINT
|
| 1052 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 1053 |
+
|
| 1054 |
+
if __name__ == "__main__":
    # Ensure the figures output directory exists so chart tools can write to it
    # and Gradio is allowed to serve files from it.
    base_proj_dir = Path(__file__).resolve().parents[2]
    figures_dir = base_proj_dir / "outputs" / "figures"
    figures_dir.mkdir(parents=True, exist_ok=True)

    # BUGFIX: `theme=` and `css=` are gr.Blocks() constructor arguments, not
    # Blocks.launch() arguments. Passing them to launch() raises
    # TypeError (unexpected keyword argument) and the app never starts,
    # so they have been removed from the launch() call below.
    # TODO: wire `theme` and CUSTOM_CSS into the gr.Blocks(...) call inside
    # build_app() so the intended styling is actually applied.
    theme = gr.themes.Soft(
        primary_hue="emerald",
        secondary_hue="slate",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
    )
    app = build_app()
    app.launch(
        server_name="0.0.0.0",  # bind on all interfaces (container / Spaces deploy)
        server_port=7860,
        share=False,
        allowed_paths=[str(figures_dir)],
    )
|
src/agents/crew.py
ADDED
|
@@ -0,0 +1,532 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
crew.py
|
| 3 |
+
-------
|
| 4 |
+
Defines the multi-agent CrewAI Team for the SPE GCS 2026 ML Challenge.
|
| 5 |
+
The Crew is triggered ONLY when deep reasoning or data aggregation is required.
|
| 6 |
+
"""
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
import logging
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
from crewai import Agent, Task, Crew, Process, LLM
|
| 13 |
+
|
| 14 |
+
# ── Transparent 429 retry patch for the native Gemini provider ────────────────
|
| 15 |
+
# CrewAI 1.10 uses GeminiCompletion (google-genai SDK) when litellm is absent.
|
| 16 |
+
# The provider has NO rate-limit retry — a 429 kills the task immediately.
|
| 17 |
+
# We patch _call_api once at import time so every LLM call auto-retries on 429.
|
| 18 |
+
def _patch_gemini_retry():
|
| 19 |
+
"""
|
| 20 |
+
Monkey-patch GeminiCompletion._handle_completion to transparently sleep
|
| 21 |
+
and retry on 429 / RESOURCE_EXHAUSTED without surfacing failures to CrewAI.
|
| 22 |
+
Delays: 10s → 20s → 40s → 60s (4 retries, max ~130s total wait).
|
| 23 |
+
"""
|
| 24 |
+
try:
|
| 25 |
+
from crewai.llms.providers.gemini.completion import GeminiCompletion
|
| 26 |
+
from google.genai.errors import APIError as _GeminiAPIError
|
| 27 |
+
|
| 28 |
+
_orig_handle = GeminiCompletion._handle_completion
|
| 29 |
+
_patch_log = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
def _retrying_handle_completion(self, *args, **kwargs):
|
| 32 |
+
_delays = [10, 20, 40, 60]
|
| 33 |
+
last_exc = None
|
| 34 |
+
for attempt, wait in enumerate([0] + _delays):
|
| 35 |
+
if wait:
|
| 36 |
+
_patch_log.warning(
|
| 37 |
+
f"[Gemini 429] Rate limit — sleeping {wait}s "
|
| 38 |
+
f"(attempt {attempt+1}/{len(_delays)+1})"
|
| 39 |
+
)
|
| 40 |
+
time.sleep(wait)
|
| 41 |
+
try:
|
| 42 |
+
return _orig_handle(self, *args, **kwargs)
|
| 43 |
+
except _GeminiAPIError as e:
|
| 44 |
+
if e.code in (429, 503) or "RESOURCE_EXHAUSTED" in str(e):
|
| 45 |
+
last_exc = e
|
| 46 |
+
continue
|
| 47 |
+
raise
|
| 48 |
+
except Exception:
|
| 49 |
+
raise
|
| 50 |
+
raise last_exc
|
| 51 |
+
|
| 52 |
+
GeminiCompletion._handle_completion = _retrying_handle_completion
|
| 53 |
+
logging.getLogger(__name__).info(
|
| 54 |
+
"GeminiCompletion._handle_completion patched — 429 auto-retry active."
|
| 55 |
+
)
|
| 56 |
+
except Exception as _patch_err:
|
| 57 |
+
logging.getLogger(__name__).warning(
|
| 58 |
+
f"Could not patch GeminiCompletion for 429 retry: {_patch_err}"
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
_patch_gemini_retry()
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _patch_max_iter_fallback():
|
| 65 |
+
"""
|
| 66 |
+
Patch handle_max_iterations_exceeded so that when the forced-summary LLM
|
| 67 |
+
call returns None/empty (often due to oversized context after many tool
|
| 68 |
+
calls), we return a graceful fallback string instead of raising ValueError.
|
| 69 |
+
Without this patch a max_iter breach always crashes the entire crew.
|
| 70 |
+
"""
|
| 71 |
+
try:
|
| 72 |
+
import crewai.agents.crew_agent_executor as _exec_mod
|
| 73 |
+
from crewai.utilities.agent_utils import handle_max_iterations_exceeded as _orig_hmie
|
| 74 |
+
_patch_log = logging.getLogger(__name__)
|
| 75 |
+
|
| 76 |
+
def _safe_hmie(*args, **kwargs):
|
| 77 |
+
try:
|
| 78 |
+
result = _orig_hmie(*args, **kwargs)
|
| 79 |
+
return result
|
| 80 |
+
except ValueError as e:
|
| 81 |
+
if "None or empty" in str(e):
|
| 82 |
+
_patch_log.warning(
|
| 83 |
+
"[CrewAI] handle_max_iterations_exceeded returned empty "
|
| 84 |
+
"— substituting graceful fallback to prevent crew crash."
|
| 85 |
+
)
|
| 86 |
+
return (
|
| 87 |
+
"I retrieved the data from the available datasets but reached the "
|
| 88 |
+
"iteration limit while correlating the findings. "
|
| 89 |
+
"The tool outputs above contain the raw numerical results. "
|
| 90 |
+
"Please ask a more focused question (e.g., one specific metric or one well) "
|
| 91 |
+
"for a complete synthesized answer."
|
| 92 |
+
)
|
| 93 |
+
raise
|
| 94 |
+
|
| 95 |
+
# Patch both the module reference and the executor's local import
|
| 96 |
+
import crewai.utilities.agent_utils as _au
|
| 97 |
+
_au.handle_max_iterations_exceeded = _safe_hmie
|
| 98 |
+
# The executor imports it at module level — patch the executor's namespace too
|
| 99 |
+
if hasattr(_exec_mod, 'handle_max_iterations_exceeded'):
|
| 100 |
+
_exec_mod.handle_max_iterations_exceeded = _safe_hmie
|
| 101 |
+
_patch_log.info(
|
| 102 |
+
"handle_max_iterations_exceeded patched — empty-response fallback active."
|
| 103 |
+
)
|
| 104 |
+
except Exception as _e:
|
| 105 |
+
logging.getLogger(__name__).warning(
|
| 106 |
+
f"Could not patch handle_max_iterations_exceeded: {_e}"
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
_patch_max_iter_fallback()
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# Schema-aware structured data tools (replace fragile Python REPL)
|
| 114 |
+
from src.agents.data_tools import (
|
| 115 |
+
DataInventoryTool,
|
| 116 |
+
DDRQueryTool,
|
| 117 |
+
WITSMLAnalystTool,
|
| 118 |
+
CrossWellCompareTool,
|
| 119 |
+
EDMTechnicalTool,
|
| 120 |
+
PythonTool,
|
| 121 |
+
)
|
| 122 |
+
# Vector search tools for qualitative knowledge
|
| 123 |
+
from src.agents.tools import IADC_SearchTool, VolveHistory_SearchTool
|
| 124 |
+
|
| 125 |
+
load_dotenv()
log = logging.getLogger(__name__)

# ── Dynamic Model Selection ───────────────────────────────────────────────────
MODEL_NAME = os.environ.get("GEMINI_MODEL", "gemini/gemini-3.1-flash-lite-preview")
API_KEY = os.environ.get("GOOGLE_API_KEY")
# BUGFIX: os.environ values must be str. When GOOGLE_API_KEY is unset,
# API_KEY is None and the unconditional assignment raised
# `TypeError: str expected, not NoneType` at import time. Only mirror the
# key (required for liteLLM underlying CrewAI) when it is actually present.
if API_KEY:
    os.environ["GEMINI_API_KEY"] = API_KEY  # Required for liteLLM underlying CrewAI
else:
    log.warning("GOOGLE_API_KEY is not set — Gemini-backed agents will fail to authenticate.")

# ── Rate limit constants (Gemini flash-lite-preview free tier) ────────────────
# 15 RPM / 250K TPM / 500 RPD (TPM is never hit; RPM is the binding constraint)
# Lean (2-task): ~6 LLM calls. Full (4-task): ~10 calls.
_INTER_TASK_DELAY_S = 2            # seconds between task completions (was 4)
_TASK_RETRY_DELAYS = [10, 20, 40]  # exponential back-off on 429 (s)

# ── Safe LLM Configuration ───────────────────────────────────────────────────
secure_llm = LLM(
    model=MODEL_NAME,
    api_key=API_KEY,
    max_tokens=8192,  # restored — 4096 caused empty responses on complex summaries
    temperature=0.2,
    num_retries=5,
    timeout=180,
)
|
| 148 |
+
|
| 149 |
+
# ── Agent Factories ───────────────────────────────────────────────────────────
|
| 150 |
+
|
| 151 |
+
def get_prompt(filename: str) -> str:
    """Load an agent system prompt from <project_root>/tests/prompts/<filename>.

    Returns a generic fallback string when the file is missing so that agent
    construction never fails on an absent prompt file.
    """
    path = Path(__file__).resolve().parents[2] / "tests" / "prompts" / filename
    try:
        return path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # BUGFIX: the original f-string interpolated nothing and literally
        # logged "(unknown)" — name the missing file so the operator can fix it.
        log.warning(f"Prompt file {path} not found, using generic fallback.")
        return "You are an AI assistant."
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def create_data_analyst():
    """Factory for the quantitative Drilling Data Analyst agent."""
    # Full structured-data toolset: inventory, DDR, WITSML, cross-well,
    # EDM technical data, and the sandboxed Python interpreter.
    toolset = [
        DataInventoryTool(),
        DDRQueryTool(),
        WITSMLAnalystTool(),
        CrossWellCompareTool(),
        EDMTechnicalTool(),
        PythonTool(),
    ]
    return Agent(
        role="Drilling Data Analyst",
        goal="Retrieve, correlate, and analyze exact numerical data from DDR and WITSML datasets.",
        backstory=get_prompt("analyst_prompt.txt"),
        tools=toolset,
        llm=secure_llm,
        allow_delegation=True,
        # Headroom for multi-well queries; 6 was too low when the agent makes
        # 4+ tool calls in one task.
        max_iter=10,
    )
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def create_history_agent():
    """Factory for the Volve Campaign Historian agent (qualitative context)."""
    return Agent(
        role="Volve Campaign Historian",
        goal="Find qualitative context from the Daily Drilling Report text for events found by the Data Analyst.",
        backstory=get_prompt("historian_prompt.txt"),
        # Single vector-search tool over the Volve history knowledge base.
        tools=[VolveHistory_SearchTool()],
        llm=secure_llm,
        allow_delegation=True,
        max_iter=3,
    )
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def create_engineer_lead():
    """Factory for the Lead Drilling Engineer agent (final synthesis)."""
    return Agent(
        role="Lead Drilling Engineer",
        goal="Synthesize the Analyst's data and Historian's context into a professional Markdown report.",
        backstory=get_prompt("lead_prompt.txt"),
        # IADC knowledge-base search for grounding terminology and standards.
        tools=[IADC_SearchTool()],
        llm=secure_llm,
        allow_delegation=True,
        max_iter=3,
    )
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def create_auditor_agent():
    """Factory for the Rig Operations Auditor agent (consistency checks)."""
    # Mix of inventory, both knowledge bases, and Python for statistics.
    audit_tools = [
        DataInventoryTool(),
        IADC_SearchTool(),
        VolveHistory_SearchTool(),
        PythonTool(),
    ]
    return Agent(
        role="Rig Operations Auditor",
        goal="Audit the findings of the Analyst and Historian for technical consistency and hidden statistical patterns.",
        backstory=get_prompt("auditor_prompt.txt"),
        tools=audit_tools,
        llm=secure_llm,
        allow_delegation=True,
        max_iter=3,
    )
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ── Request classifier ────────────────────────────────────────────────────────
|
| 210 |
+
|
| 211 |
+
def _is_lean_request(question: str) -> bool:
|
| 212 |
+
"""
|
| 213 |
+
Returns True for chart/visualization and simple single-source queries.
|
| 214 |
+
These go through a 2-task crew (analysis→synthesis only), skipping
|
| 215 |
+
KB grounding, Historian, and Auditor to stay well within the 15 RPM budget.
|
| 216 |
+
|
| 217 |
+
Always returns False (full crew) for questions requiring historical narrative,
|
| 218 |
+
cross-well comparison, lessons learned, root cause, or risk assessment.
|
| 219 |
+
"""
|
| 220 |
+
q = question.lower()
|
| 221 |
+
# Full crew only for questions that genuinely need narrative context or cross-well synthesis.
|
| 222 |
+
# Data questions (even NPT) are lean — DDRQueryTool already returns activity codes + comments.
|
| 223 |
+
full_kw = [
|
| 224 |
+
'lessons learned', 'lessons from', 'campaign summary', 'what happened',
|
| 225 |
+
'explain why', 'root cause', 'why did', 'compare across', 'comparison between wells',
|
| 226 |
+
'recommend', 'recommendation', 'predict', 'risk assessment',
|
| 227 |
+
'handover', 'handoff summary', 'give me a summary of the campaign',
|
| 228 |
+
]
|
| 229 |
+
if any(kw in q for kw in full_kw):
|
| 230 |
+
return False
|
| 231 |
+
lean_kw = [
|
| 232 |
+
'chart', 'plot', 'graph', 'visualize', 'days vs depth', 'generate a',
|
| 233 |
+
'draw', 'how many', 'what is the average', 'list the', 'show me the',
|
| 234 |
+
'compar', # catches compare/comparison → uses CrossWellCompareTool (1 call vs 6+)
|
| 235 |
+
]
|
| 236 |
+
return any(kw in q for kw in lean_kw)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
# ── Shared crew infrastructure ────────────────────────────────────────────────
|
| 240 |
+
|
| 241 |
+
def _build_shared(question: str, event_queue):
    """Create shared callbacks and agent instances.

    Builds the step/task callbacks that stream HUD events into ``event_queue``
    and instantiates the four agents with the step callback attached.

    NOTE(review): ``question`` is currently unused inside this function —
    presumably kept for interface parity with the caller; confirm before
    removing it.

    Returns:
        (analyst, historian, auditor, lead, step_callback, task_callback)
    """
    def step_callback(step):
        # Normalize whatever CrewAI passes (object or dict) into HUD log events.
        agent_name = "Agent"
        thought = ""
        tool = None
        tool_input = ""
        try:
            # Object-style step: read attributes if present.
            if hasattr(step, 'agent'): agent_name = step.agent
            if hasattr(step, 'tool'): tool = step.tool
            if hasattr(step, 'tool_input'): tool_input = step.tool_input
            if hasattr(step, 'thought'): thought = step.thought
            elif hasattr(step, 'text'): thought = step.text
            # Dict-style step: dict values override attribute values.
            if isinstance(step, dict):
                agent_name = step.get('agent', agent_name)
                thought = step.get('thought', step.get('text', ''))
                tool = step.get('tool')
                tool_input = step.get('tool_input', '')
            # Emit a "Thought" event only for non-trivial text (> 5 chars).
            if thought and len(thought) > 5:
                event_queue.put({"event": "log", "icon": "🧠", "name": agent_name,
                                 "status": "Thought", "detail": thought[:200],
                                 "detail_full": thought, "is_dialogue": False})
            if tool:
                # Delegation tools are rendered as inter-agent dialogue.
                if tool in ["Ask question to co-worker", "Delegate work to co-worker"]:
                    event_queue.put({"event": "log", "icon": "💬", "name": agent_name,
                                     "status": f"🗣️ Interaction: {tool}",
                                     "detail": f"Message: {tool_input}" if tool_input else "",
                                     "is_dialogue": True})
                else:
                    # Ordinary tool call: truncated detail plus full payload.
                    ti_str = str(tool_input) if tool_input else ""
                    event_queue.put({"event": "log", "icon": "🔧", "name": agent_name,
                                     "status": f"Action: {tool}",
                                     "detail": f"Input: {ti_str[:120]}" if ti_str else "",
                                     "detail_full": f"Tool: {tool}\nInput:\n{ti_str}" if ti_str else f"Tool: {tool}",
                                     "is_dialogue": False})
        except Exception as e:
            # A broken callback must never crash the crew — surface it as a log event.
            event_queue.put({"event": "log", "icon": "⚠️", "name": "System",
                             "status": "Callback Error", "detail": str(e), "is_dialogue": False})

    def task_callback(task_output):
        # Runs after each Task completes: emit a handoff event, then sleep
        # to respect the Gemini RPM budget before the next task starts.
        agent_role = getattr(task_output, 'agent', 'Agent')
        summary = ""
        raw_output = ""
        if hasattr(task_output, 'raw') and task_output.raw:
            raw_output = str(task_output.raw)
            summary = raw_output.replace('\n', ' ')[:120] + "..."
        else:
            summary = "Passing analysis to the next step..."
        event_queue.put({"event": "log", "icon": "📋", "name": agent_role,
                         "status": "🗣️ Interaction: Handoff Complete",
                         "detail": summary, "detail_full": raw_output or summary, "is_dialogue": True})
        # The Data Analyst is the heaviest RPM consumer (up to 4 tool calls × LLM).
        # Give a longer cooling window specifically after it to protect the next agent.
        role_str = str(agent_role)
        delay = 6 if "Analyst" in role_str else _INTER_TASK_DELAY_S
        event_queue.put({"event": "log", "icon": "⏳", "name": "Rate Limiter",
                         "status": f"Cooling {delay}s after {role_str.split()[-1]} task…",
                         "detail": "Respecting Gemini 15 RPM budget", "is_dialogue": False})
        time.sleep(delay)

    analyst = create_data_analyst()
    historian = create_history_agent()
    auditor = create_auditor_agent()
    lead = create_engineer_lead()

    # Attach the streaming step callback to every agent.
    for agent in [analyst, historian, auditor, lead]:
        agent.step_callback = step_callback

    return analyst, historian, auditor, lead, step_callback, task_callback
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def _run_crew_thread(crew, event_queue):
    """Retry-aware crew kickoff with exponential back-off on 429.

    NOTE: stdout is already redirected by run_aggregation_loop before Crew construction
    so that the Rich Console (created at Crew.__init__ time) writes to the capture buffer.
    """
    import traceback

    last_exc = None
    schedule = [0] + _TASK_RETRY_DELAYS  # first attempt fires immediately
    for attempt, delay in enumerate(schedule):
        if delay:
            event_queue.put({"event": "log", "icon": "⏳", "name": "Rate Limiter",
                             "status": f"429 back-off — waiting {delay}s (attempt {attempt+1}/4)…",
                             "detail": "Gemini RPM limit hit, retrying shortly", "is_dialogue": False})
            time.sleep(delay)
        try:
            res = crew.kickoff()
            event_queue.put({"event": "final_answer", "answer": res.raw})
            event_queue.put(None)  # sentinel: tells the consumer loop to stop
            return
        except Exception as e:
            last_exc = e
            err_str = str(e).lower()
            tb = traceback.format_exc()
            log.error(f"Crew attempt {attempt+1} failed: {type(e).__name__}: {e}\n{tb}")
            # Surface the exception detail to the UI as a log event.
            event_queue.put({"event": "log", "icon": "🔴", "name": "Crew Error",
                             "status": f"{type(e).__name__}: {str(e)[:120]}",
                             "detail": tb.splitlines()[-3] if tb else "",
                             "is_dialogue": False})
            # Only rate-limit-looking failures are worth retrying.
            if not any(token in err_str for token in ("429", "rate", "quota")):
                break
    event_queue.put({"event": "error", "message": f"{type(last_exc).__name__}: {last_exc}"})
    event_queue.put(None)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
# ── Aggregation Loop ──────────────────────────────────────────────────────────
|
| 347 |
+
|
| 348 |
+
def run_aggregation_loop(question: str):
    """
    Generator yielding status logs then a final_answer event.
    Routes to a lean 2-task crew (chart/simple) or full 4-task crew (deep analysis).
    Lean crew: ~6 LLM calls, ~35-50s. Full crew: ~10 calls, ~75-90s.

    Events are produced by a background thread into a Queue and re-yielded
    here; a ``None`` sentinel on the queue ends the stream. After the crew
    finishes, the captured verbose transcript is yielded as a final
    ``verbose_log`` event.
    """
    from queue import Queue
    import threading

    event_queue = Queue()
    lean = _is_lean_request(question)

    mode_label = "LEAN (2-task)" if lean else "FULL (4-task)"
    event_queue.put({"event": "log", "icon": "🔀", "name": "Router",
                     "status": f"Crew mode: {mode_label}",
                     "detail": "Lean = analysis+synthesis | Full adds grounding+context",
                     "is_dialogue": False})

    # NOTE(review): `auditor` is built by _build_shared but is not placed in
    # either crew's agents list below — presumably reachable only via
    # delegation; confirm whether it is still needed.
    analyst, historian, auditor, lead, step_callback, task_callback = \
        _build_shared(question, event_queue)

    # ── Task definitions ──────────────────────────────────────────────────────

    # Comparison-specific vs general analysis description
    _is_comparison = 'compar' in question.lower()
    if _is_comparison:
        # Comparison prompts force a single CrossWell_Comparison call to keep
        # the tool-call count (and therefore RPM usage) minimal.
        _analyze_desc = (
            f"The user asked: '{question}'\n\n"
            "MANDATORY TOOL SEQUENCE — follow exactly, no deviations:\n"
            "Step 1 (ONLY step): Call `CrossWell_Comparison` ONCE with all wells mentioned.\n"
            " → This single call returns DDR + WITSML data for every well. NO other data tools are needed.\n"
            "Step 2: Write your markdown answer immediately after receiving the CrossWell_Comparison result.\n"
            " → Include a comparison table (ROP, NPT %, BHA runs) per well and per hole section.\n"
            "PROHIBITED: Do NOT call data_inventory_inspector, DDR_Query, WITSML_Analyst, or python_interpreter.\n"
            "NOTE: Translate any Norwegian text in tool output to English."
        )
    else:
        _analyze_desc = (
            f"The user asked: '{question}'\n\n"
            "Retrieve and analyze data with the MINIMUM set of tools needed:\n"
            " • Single-well data (phases, ROP, NPT)? → Use `DDR_Query` and/or `WITSML_Analyst`\n"
            " • BHA / casing / formations? → Use `EDM_Technical_Query`\n"
            " • Chart/visualization? → Use `python_interpreter` with load_ddr() / load_witsml() / days_vs_depth() helpers\n"
            " NPT identification: always call df['activity_code'].value_counts().head(30) FIRST to see available codes,\n"
            " then filter with df['activity_code'].str.upper().str.contains('NPT|WOW|WAIT|STUCK|PACK|FISH|CIRC|TEST|DELAY|BREAK', na=False)\n"
            " • Skip data_inventory_inspector unless you genuinely don't know which wells exist.\n"
            "Return tables, stats, and any chart file paths. Translate Norwegian text to English."
        )

    # ── LEAN: 2-task crew (analysis + synthesis only — no KB grounding step) ──
    task_analyze_lean = Task(
        description=_analyze_desc,
        expected_output=(
            "Markdown summary with exact numbers from tools. "
            "Activity/stats table required. If a chart was generated, include the full file path."
        ),
        agent=analyst,
        context=[]
    )

    task_synth_lean = Task(
        description=(
            f"The user asked: '{question}'\n"
            "Synthesize the Analyst's findings into a direct Odin response. "
            "DO NOT call any tools — use only the context you already have. "
            "CRITICAL: Do NOT mention crew members. Present findings natively as Odin. "
            "CRITICAL: ABSOLUTELY NO email headers, no To/From/Subject, no memorandum structure."
        ),
        expected_output="A direct, highly technical engineering response. No email headers.",
        agent=lead,
        context=[task_analyze_lean]
    )

    # ── FULL: 4-task crew (grounding + analysis + context + synthesis) ────────
    task_ground = Task(
        description=(
            f"Question: '{question}'\n"
            "Search the Volve Campaign History DB for relevant background context on this topic. "
            "Use `VolveHistory_SearchTool` ONLY (one call). "
            "Provide a brief 'Contextual Brief' — key events, problems, or precedents relevant to the question."
        ),
        expected_output="A concise contextual brief from the Volve operational history database.",
        agent=lead
    )

    task_analyze_full = Task(
        description=_analyze_desc,
        expected_output=(
            "Markdown summary with exact numbers from tools. "
            "Activity/stats table required. If a chart was generated, include the full file path."
        ),
        agent=analyst,
        context=[task_ground]
    )

    task_context = Task(
        description=(
            f"The user asked: '{question}'\n"
            "The Analyst found quantitative results (see context above). Do two things in ONE pass:\n"
            "1. HISTORY: Use `VolveHistory_SearchTool` to find narrative context — events, incidents, or decisions "
            "that explain the Analyst's numbers. Cite sources as [Volve-Hist-N].\n"
            "2. STATS AUDIT: Using only the numbers already in context (no new tool calls), check Mean vs Median "
            "for ROP/NPT. Note whether performance was consistent or outlier-dominated.\n"
            "Combine both into a single 'Context & Verification' response."
        ),
        expected_output=(
            "Combined: (a) relevant historical events with source citations, "
            "(b) quick statistical consistency note on the Analyst's key numbers."
        ),
        agent=historian,
        context=[task_analyze_full]
    )

    task_synth_full = Task(
        description=(
            f"The user asked: '{question}'\n"
            "Synthesize all findings into a comprehensive Odin response. "
            "DO NOT call any tools — use only the context you already have. "
            "Weave in the quantitative results, historical context, and statistical insights naturally. "
            "Include Evidence, Assumptions, and Confidence Level inline (not as separate sections unless asked). "
            "CRITICAL: Do NOT mention crew members. Present all data natively as Odin. "
            "CRITICAL: ABSOLUTELY NO email headers, no To/From/Subject, no formal memorandum structure."
        ),
        expected_output="A direct, conversational yet highly technical engineering response. No email headers.",
        agent=lead,
        context=[task_analyze_full, task_context]
    )

    # ── Redirect stdout BEFORE Crew construction so the Rich Console writes to buffer ──
    # CrewAI's verbose output uses a Rich Console created at Crew.__init__ time.
    # If we redirect after construction, Console keeps the original stdout reference.
    from io import StringIO
    import sys as _sys
    import re as _re_ansi
    _stdout_buf = StringIO()
    _orig_stdout = _sys.stdout
    _sys.stdout = _stdout_buf

    # ── Route to lean (2-task) or full (4-task) crew ──────────────────────────
    try:
        if lean:
            crew = Crew(
                agents=[analyst, lead],
                tasks=[task_analyze_lean, task_synth_lean],
                process=Process.sequential,
                max_rpm=14,
                verbose=True,
                task_callback=task_callback,
                step_callback=step_callback
            )
        else:
            crew = Crew(
                agents=[lead, analyst, historian],
                tasks=[task_ground, task_analyze_full, task_context, task_synth_full],
                process=Process.sequential,
                max_rpm=10,
                verbose=True,
                task_callback=task_callback,
                step_callback=step_callback
            )
    except Exception:
        # Restore stdout even if Crew construction itself fails.
        _sys.stdout = _orig_stdout
        raise

    def run_crew():
        _run_crew_thread(crew, event_queue)

    thread = threading.Thread(target=run_crew)
    thread.start()

    # Stream events from the worker thread until the None sentinel arrives.
    while True:
        event = event_queue.get()
        if event is None:
            break
        yield event

    thread.join()

    # ── Restore stdout and emit captured transcript ────────────────────────────
    _sys.stdout = _orig_stdout
    _raw_transcript = _stdout_buf.getvalue()
    if _raw_transcript.strip():
        # Strip ANSI escape codes (Rich colour markup)
        _clean = _re_ansi.sub(r'\x1b\[[0-9;]*[mGKHF]', '', _raw_transcript)
        yield {"event": "verbose_log", "content": _clean}
|
src/agents/data_tools.py
ADDED
|
@@ -0,0 +1,1141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
data_tools.py
|
| 3 |
+
-------------
|
| 4 |
+
Schema-aware, purpose-built tools for querying the Volve structured data.
|
| 5 |
+
|
| 6 |
+
These tools know the exact schema of each data source and use fuzzy matching
|
| 7 |
+
to handle typos or inconsistent well name formatting from users.
|
| 8 |
+
|
| 9 |
+
Available Tools:
|
| 10 |
+
1. DataInventoryTool - Lists all 23 wells and available data sources.
|
| 11 |
+
2. DDRQueryTool - Queries DDR activity logs for a named well with NPT focus.
|
| 12 |
+
3. WITSMLAnalystTool - Computes drilling stats (ROP/TQA/SPP/WOB) from WITSML CSVs.
|
| 13 |
+
4. CrossWellCompareTool - Compares key statistics across two wells side by side.
|
| 14 |
+
5. EDMTechnicalTool - Queries Technical data (BHA, Casing, Formations) from EDM.
|
| 15 |
+
6. PythonTool - Allows the analyst to perform custom Pandas/Matplotlib analysis.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import subprocess
|
| 19 |
+
import sys
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import re
|
| 23 |
+
import pandas as pd
|
| 24 |
+
import numpy as np
|
| 25 |
+
import matplotlib
|
| 26 |
+
matplotlib.use('Agg') # headless backend - must be set before pyplot import
|
| 27 |
+
import matplotlib.pyplot as plt
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
from crewai.tools import BaseTool
|
| 30 |
+
from dotenv import load_dotenv
|
| 31 |
+
|
| 32 |
+
load_dotenv()
|
| 33 |
+
|
| 34 |
+
# Repo root: data_tools.py lives in src/agents/, so two parents up is the project root.
BASE_DIR = Path(__file__).resolve().parents[2]
# Processed-data directories; DDR_DIR and WITSML_DIR can be overridden via env vars.
DDR_DIR = Path(os.environ.get("DDR_DIR", str(BASE_DIR / "data" / "processed" / "ddr")))
WITSML_DIR = Path(os.environ.get("WITSML_DIR", str(BASE_DIR / "data" / "processed" / "witsml")))
# NOTE(review): EDM_DIR has no env-var override, unlike the two above — confirm intended.
EDM_DIR = BASE_DIR / "data" / "processed" / "edm"
# Charts generated by the tools are written here; created eagerly at import time.
OUTPUTS_DIR = BASE_DIR / "outputs" / "figures"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 40 |
+
|
| 41 |
+
# ── Helpers ────────────────────────────────────────────────────────────────────
|
| 42 |
+
|
| 43 |
+
def _normalize_well(name: str) -> str:
|
| 44 |
+
"""
|
| 45 |
+
Normalize a user-supplied well name to a canonical slug used in filenames.
|
| 46 |
+
e.g. 'NO 15/9-19 A', '15/9-19A', '15-9-19a', '15 9 19 a' → '15_9_19_A'
|
| 47 |
+
e.g. '15/9-F-1 C', '15/9 F 1C' → '15_9_F_1_C'
|
| 48 |
+
"""
|
| 49 |
+
s = name.strip().upper()
|
| 50 |
+
# Strip the 'NO ' prefix if present
|
| 51 |
+
s = re.sub(r'^NO\s+', '', s)
|
| 52 |
+
# Replace all separators (/, -, space) with single underscore
|
| 53 |
+
s = re.sub(r'[\s/\-]+', '_', s)
|
| 54 |
+
# Collapse multiple underscores
|
| 55 |
+
s = re.sub(r'_+', '_', s)
|
| 56 |
+
s = s.strip('_')
|
| 57 |
+
return s
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _fuzzy_find_well_file(desired: str, suffix: str = "_activities.csv") -> Path | None:
    """
    Find the best-matching file in DDR_DIR for a given well name.

    Matching strategy, strongest first:
      1. exact stem match (normalized slug + suffix stem),
      2. file stem starts with the slug,
      3. slug contained anywhere in the stem,
      4. token overlap (at least 2 shared '_'-separated tokens required).

    Returns None when nothing clears the token-overlap threshold.
    """
    target_slug = _normalize_well(desired)
    suffix_stem = suffix.replace('.csv', '')  # e.g. '_activities'

    candidates = list(DDR_DIR.glob(f"*{suffix}"))

    # Step 1: exact stem match. (The original recomputed an unused slug and
    # the suffix stem inside the loop; both hoisted/removed.)
    expected_stem = (target_slug + suffix_stem).upper()
    for c in candidates:
        if c.stem.upper() == expected_stem:
            return c

    # Step 2: exact slug prefix match (file stem starts with the target slug)
    for c in candidates:
        if c.stem.upper().startswith(target_slug.upper()):
            return c

    # Step 3: target slug is contained in the filename stem
    for c in candidates:
        if target_slug.upper() in c.stem.upper():
            return c

    # Step 4: fuzzy token overlap — pick the file sharing the most tokens
    target_tokens = set(target_slug.split('_'))
    best_score = 0
    best_match = None
    for c in candidates:
        file_tokens = set(re.sub(r'_+', '_', c.stem.upper()).split('_'))
        score = len(target_tokens & file_tokens)
        if score > best_score:
            best_score = score
            best_match = c

    # Require a minimum overlap of 2 tokens to avoid spurious matches.
    return best_match if best_score >= 2 else None
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Keyword → drilling-phase lookup used by _classify_phase. Checked in
# insertion order against the lower-cased activity_code, so the specific
# "drilling -- X" keys must stay ABOVE the bare "trip"/"drill" catch-alls.
_PHASE_MAP = {
    # (keyword in activity_code) → phase label
    "drilling -- drill": "Rotary/Sliding Drilling",
    "drilling -- trip": "Tripping (POOH/TIH)",
    "drilling -- wiper": "Wiper Trip",
    "drilling -- circulate": "Circulation/Conditioning",
    "drilling -- circ": "Circulation/Conditioning",
    "drilling -- condition": "Circulation/Conditioning",
    "casing": "Casing/Liner Running",
    "liner": "Casing/Liner Running",
    "cement": "Cementing",
    "logging": "Logging/Survey",
    "wireline": "Logging/Survey",
    "sidetrack": "Sidetrack/Remedial",
    "whipstock": "Sidetrack/Remedial",
    "milling": "Sidetrack/Remedial",
    "fishing": "NPT – Fishing",
    "stuck": "NPT – Stuck Pipe",
    "repair": "NPT – Equipment Repair",
    "wait": "NPT – Waiting/Weather",
    "weather": "NPT – Waiting/Weather",
    "npt": "NPT – General",
    "bha": "BHA Change/Rig-Up",
    "bit change": "BHA Change/Rig-Up",
    # 'loss' also matches substrings like 'losses' — substring match is intentional.
    "washout": "NPT – Washout/Losses",
    "loss": "NPT – Washout/Losses",
    "lcm": "NPT – Washout/Losses",
    "trip": "Tripping (POOH/TIH)",  # catch-all trip at end
    "drill": "Rotary/Sliding Drilling",  # catch-all drill at end
}
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _classify_phase(activity_code: str) -> str:
|
| 132 |
+
"""Map an activity_code string to a drilling phase label."""
|
| 133 |
+
if not isinstance(activity_code, str):
|
| 134 |
+
return "Other"
|
| 135 |
+
ac = activity_code.lower().strip()
|
| 136 |
+
for keyword, phase in _PHASE_MAP.items():
|
| 137 |
+
if keyword in ac:
|
| 138 |
+
return phase
|
| 139 |
+
return "Other"
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _list_all_wells() -> list[str]:
    """Return a sorted list of readable well names derived from DDR file stems.

    Stems starting with '_' are aggregate files, not wells, and are skipped.
    The '_activities' suffix is dropped and remaining underscores become '/',
    e.g. '15_9_19_A_activities' → '15/9/19/A'.
    """
    return sorted(
        f.stem.replace('_activities', '').replace('_', '/')
        for f in DDR_DIR.glob("*_activities.csv")
        if not f.stem.startswith('_')
    )
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ── Tool 1: Data Inventory ─────────────────────────────────────────────────────
|
| 157 |
+
|
| 158 |
+
class DataInventoryTool(BaseTool):
    """Tool that reports every Volve well and data source present on disk."""

    name: str = "data_inventory_inspector"
    description: str = (
        "Use this tool FIRST when the user asks what wells or datasets are available, "
        "or before any data query to confirm a well name exists. "
        "Returns a structured inventory of all 23 Volve wells and the types of data "
        "available (DDR activities, WITSML sensor logs, EDM metadata)."
    )

    def _run(self, query: str = "") -> str:
        """Assemble the Markdown inventory; the `query` argument is ignored."""
        report = ["## 📋 Volve Field – Available Data Inventory\n"]

        # Section 1 — wells that have DDR activity/summary CSVs.
        wells = _list_all_wells()
        report.append(f"### Daily Drilling Reports (DDR) — {len(wells)} Wells")
        report.append("Each well has: `_activities.csv` (activity time-log) and `_daily_summary.csv` (per-day totals).")
        report.append("**Available Wells:**")
        report.extend(f" - `{w}`" for w in wells)

        # Section 2 — optional aggregate files, mentioned only when present.
        if (DDR_DIR / "_ddr_all_activities.csv").exists():
            report.append("\n**Global Aggregate File:** `_ddr_all_activities.csv` — all 23 wells merged (~32,000 rows)")
        if (DDR_DIR / "_ddr_extraction_summary.csv").exists():
            report.append("**Summary File:** `_ddr_extraction_summary.csv` — one row per well with spud/completion dates")

        # Section 3 — WITSML sensor logs; the well name is the part before '__'.
        witsml_files = list(WITSML_DIR.glob("*.csv"))
        report.append(f"\n### WITSML Sensor Logs — {len(witsml_files)} CSV files")
        report.append("Fields include: `ROP`, `RPM`, `WOB`, `SPPA` (standpipe pressure), `HKLD` (hookload), `TQA` (torque), depth, and more.")
        witsml_wells = sorted({f.name.split('__')[0] for f in witsml_files if '__' in f.name})
        report.append(f"Wells with WITSML data: {', '.join(witsml_wells)}")

        # Section 4 — EDM tables (the directory may be absent in a partial download).
        edm_files = list(EDM_DIR.glob("*.csv")) if EDM_DIR.exists() else []
        report.append(f"\n### EDM (Engineering Data Model) — {len(edm_files)} tables")
        report.append("Includes: wellbore geometry, BHA components, survey stations, pore pressure, casing data.")

        report.append("\n---")
        report.append("💡 **Tip:** Use `DDR_Query` with a well name to get activity logs, or `WITSML_Analyst` for sensor-level stats.")
        return "\n".join(report)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# ── Tool 2: DDR Activity Query ─────────────────────────────────────────────────
|
| 202 |
+
|
| 203 |
+
class DDRQueryTool(BaseTool):
    """Tool that builds a Markdown DDR activity report (with NPT focus) for one well."""

    name: str = "DDR_Query"
    description: str = (
        "Query the Daily Drilling Report (DDR) activity log for a specific well. "
        "Accepts any well name variant (e.g. '15/9-19 A', '15/9-F-1C', '15-9-F-1 C', typos OK). "
        "Returns: a Markdown table of activities with dates, depths, activity codes, duration, and comments. "
        "Also returns NPT (Non-Productive Time) summary and total drilled depth. "
        "Input: well name as a string."
    )

    def _run(self, well_name: str) -> str:
        """Build the Markdown DDR report for `well_name`.

        The name is fuzzy-matched against `*_activities.csv` files in DDR_DIR;
        when no file matches, a hint listing available wells is returned
        instead of raising. All column accesses are guarded so a CSV with a
        partial schema degrades gracefully instead of raising KeyError.
        """
        path = _fuzzy_find_well_file(well_name, "_activities.csv")
        if path is None:
            available = ", ".join(_list_all_wells()[:10]) + "..."
            return (f"❌ Could not find DDR data for well `{well_name}`. "
                    f"Try: {available}")

        try:
            df = pd.read_csv(path)
        except Exception as e:
            return f"Error reading file {path}: {e}"

        total_rows = len(df)
        matched_well = path.stem.replace('_activities', '')

        # ── Basic stats ──
        lines = [f"## DDR Activity Report — Well: `{matched_well}` (matched from `{well_name}`)\n"]
        lines.append(f"**Total activity records:** {total_rows}")

        # Duration totals. total_h defaults to 0.0 so the NPT section below is
        # safe even when the column is missing — previously it referenced
        # total_h unconditionally (NameError) and divided by it (possible
        # ZeroDivisionError).
        total_h = 0.0
        if 'duration_hours' in df.columns:
            df['duration_hours'] = pd.to_numeric(df['duration_hours'], errors='coerce')
            total_h = df['duration_hours'].sum()
            lines.append(f"**Total logged time:** {total_h:.1f} hours ({total_h/24:.1f} days)")

        # Depth range
        if 'md_m' in df.columns:
            df['md_m'] = pd.to_numeric(df['md_m'], errors='coerce')
            lines.append(f"**Depth range:** {df['md_m'].min():.0f} m — {df['md_m'].max():.0f} m MD")

        # ── Activity code breakdown (needs both columns; was unguarded on duration) ──
        if 'activity_code' in df.columns and 'duration_hours' in df.columns:
            act_counts = df.groupby('activity_code')['duration_hours'].sum().sort_values(ascending=False).head(12)
            lines.append("\n### Top Activities by Time (hours)\n")
            lines.append(act_counts.reset_index().rename(columns={'activity_code': 'Activity', 'duration_hours': 'Hours'}).to_markdown(index=False, floatfmt=".1f"))

        # ── Drilling Phase Breakdown ──
        if 'activity_code' in df.columns and 'duration_hours' in df.columns:
            df['_phase'] = df['activity_code'].apply(_classify_phase)
            phase_totals = (
                df.groupby('_phase')['duration_hours']
                .sum()
                .sort_values(ascending=False)
            )
            total_phase_h = phase_totals.sum()
            if total_phase_h > 0:
                phase_df = phase_totals.reset_index()
                phase_df.columns = ['Phase', 'Hours']
                phase_df['%'] = (phase_df['Hours'] / total_phase_h * 100).round(1)
                lines.append("\n### 🔄 Drilling Phase Distribution\n")
                lines.append(phase_df.to_markdown(index=False, floatfmt=".1f"))

        # ── NPT summary ──
        if 'activity_code' in df.columns:
            # Broadened NPT keywords for stricter classification
            npt_keywords = ['npt', 'fishing', 'stuck', 'repair', 'wait', 'sidetrack', 'washout', 'twist off', 'leak', 'loss', 'plug']
            npt_mask = df['activity_code'].str.lower().str.contains('|'.join(npt_keywords), na=False)

            # Also catch free-text comments mentioning problems (column may be absent)
            problem_keywords = ['problem', 'failure', 'broken', 'damage', 'stuck', 'overpull', 'tight']
            if 'comments' in df.columns:
                comment_mask = df['comments'].str.lower().str.contains('|'.join(problem_keywords), na=False)
                combined_npt_mask = npt_mask | comment_mask
            else:
                combined_npt_mask = npt_mask
            npt_df = df[combined_npt_mask]

            if not npt_df.empty:
                npt_total = npt_df['duration_hours'].sum() if 'duration_hours' in npt_df.columns else len(npt_df)
                lines.append("\n### ⚠️ NPT & Operational Events Summary")
                if total_h > 0:
                    lines.append(f"**Total NPT/Event hours:** {npt_total:.1f} h ({npt_total/total_h*100:.1f}% of total logged time)")
                else:
                    # No usable duration column → report the total without a percentage.
                    lines.append(f"**Total NPT/Event hours:** {npt_total:.1f} h")
                npt_cols = [c for c in ['act_start', 'activity_code', 'state_detail', 'duration_hours', 'comments'] if c in npt_df.columns]
                lines.append(npt_df[npt_cols].head(20).fillna('').to_markdown(index=False))

        # ── Depth Samples ──
        if 'md_m' in df.columns:
            depth_cols = [c for c in ['act_start', 'md_m', 'activity_code', 'comments'] if c in df.columns]
            lines.append("\n### 📏 Depth Progression Sample")
            lines.append(df[depth_cols].dropna(subset=['md_m']).tail(10).to_markdown(index=False))

        # ── Recent activities sample ──
        cols = [c for c in ['act_start', 'md_m', 'activity_code', 'state', 'duration_hours', 'comments'] if c in df.columns]
        lines.append("\n### Recent Activity Sample (last 10 records)\n")
        lines.append(df[cols].tail(10).fillna('').to_markdown(index=False))

        result = "\n".join(lines)
        # Keep the tool output within a safe size for the LLM context window.
        if len(result) > 14000:
            return result[:14000] + "\n\n...[TRUNCATED — use more specific queries for details]"
        return result
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# ── Tool 3: WITSML Sensor Analyst ─────────────────────────────────────────────
|
| 305 |
+
|
| 306 |
+
class WITSMLAnalystTool(BaseTool):
    name: str = "WITSML_Analyst"
    description: str = (
        "Compute drilling performance statistics from WITSML sensor logs for a specific well. "
        "Accepts any well name variant (typos OK). "
        "Returns: average/max/min ROP (rate of penetration), WOB (weight on bit), RPM, torque, "
        "standpipe pressure, hookload, and available depth range. "
        "Can also save a time-series plot of ROP vs depth if 'plot=true' is in the input. "
        "Input: well name (optionally append ' plot=true' to generate a chart)."
    )

    def _run(self, query: str) -> str:
        """Parse the free-form query, load matching WITSML CSVs, and return
        Markdown drilling-performance statistics (optionally saving a chart).

        The query may embed three directives, each stripped before the
        remainder is treated as the well name:
          - 'plot=true'    → save an ROP-vs-depth scatter PNG
          - 'depth=A-B'    → restrict rows to that measured-depth window (m)
          - 'section=X'    → only use log files whose name contains X
        """
        # Parse plot flag and filters
        plot = 'plot=true' in query.lower()
        query = query.lower().replace('plot=true', '').strip()

        # Extract depth=X-Y (measured-depth window in metres)
        depth_range = None
        depth_match = re.search(r'depth=([\d\.]+)-([\d\.]+)', query)
        if depth_match:
            depth_range = (float(depth_match.group(1)), float(depth_match.group(2)))
            query = query.replace(depth_match.group(0), '')

        # Extract section=X (presumably a hole size in inches — see title below)
        section_filter = None
        sec_match = re.search(r'section=([\d\.]+)', query)
        if sec_match:
            section_filter = sec_match.group(1)
            query = query.replace(sec_match.group(0), '')

        # Whatever remains after stripping directives is the well name.
        well_name = query.replace(',', '').strip()

        well_slug = _normalize_well(well_name)

        # Find all WITSML files for this well; file names are '<WELL>__<log>.csv'
        all_files = list(WITSML_DIR.glob("*.csv"))
        matching = [f for f in all_files if f.name.upper().startswith(well_slug.upper() + '__')]

        if not matching:
            # Fuzzy fallback: keep every file tied for the max token overlap
            # with the slug (>= 2 shared tokens required).
            tokens = set(well_slug.split('_'))
            scored = []
            for f in all_files:
                file_tokens = set(re.sub(r'_+', '_', f.name.upper()).split('_'))
                score = len(tokens & file_tokens)
                scored.append((score, f))
            scored.sort(reverse=True)
            if scored and scored[0][0] >= 2:
                top_score = scored[0][0]
                matching = [f for s, f in scored if s == top_score]

        if not matching:
            return (f"❌ No WITSML data found for well `{well_name}` (slug: `{well_slug}`). "
                    f"Use the data_inventory_inspector tool to see what wells have WITSML data.")

        # Prefer depth-indexed MD logs (more useful for drilling analysis than time logs)
        depth_files = [f for f in matching if 'DEPTH' in f.name.upper() and 'MD_LOG' in f.name.upper()]
        target_files = depth_files if depth_files else matching

        # If a section filter is specified, only use files matching that section
        if section_filter:
            sec_files = [f for f in target_files if section_filter in f.name]
            if sec_files:
                target_files = sec_files
            else:
                return f"❌ No WITSML logs found for section {section_filter} in well {well_name}."

        # Load and concatenate all matching files; unreadable files are skipped
        # silently (best-effort load — a partial dataset is still useful).
        dfs = []
        for f in target_files:
            try:
                dfs.append(pd.read_csv(f, low_memory=False))
            except Exception:
                pass

        if not dfs:
            return f"Found {len(matching)} WITSML file(s) but could not read any of them."

        df = pd.concat(dfs, ignore_index=True)
        matched_well = matching[0].name.split('__')[0]

        # ── Column mapping: each parameter has several possible WITSML mnemonics;
        #    the first alternative present AND containing at least one numeric
        #    value wins. Order within each list encodes preference.
        COL_MAP = {
            'ROP': ['ROP', 'GS_ROP', 'ROP5', 'ROPIH', 'ROPH'],
            'WOB': ['CWOB', 'WOB'],
            'RPM': ['RPM', 'GS_RPM', 'DRPM', 'TRPM_RT'],
            'TORQUE': ['TQA', 'GS_TQA'],
            'SPP': ['SPPA', 'GS_SPPA'],
            'HOOKLD': ['HKLD', 'GS_HKLD', 'HKLO', 'HKLI'],
            'DEPTH': ['DMEA', 'DEPTH', 'DEPT', 'TVDE'],
        }

        found_cols: dict[str, str] = {}
        for key, alts in COL_MAP.items():
            for alt in alts:
                if alt in df.columns:
                    # Skip all-NaN columns and fall through to the next alias.
                    if pd.to_numeric(df[alt], errors='coerce').notnull().any():
                        found_cols[key] = alt
                        break

        # ── Depth sanity: drop sentinel null values, convert feet→metres when the
        #    median suggests imperial units, then cap at 5500 m (below the deepest
        #    plausible Volve MD) to clamp residual outliers.
        _NULLS = {-999.25, -999.0, -9999.0, 9999.0, 9999.25}
        if 'DEPTH' in found_cols:
            d_col = found_cols['DEPTH']
            df[d_col] = pd.to_numeric(df[d_col], errors='coerce')
            df[d_col] = df[d_col].where(~df[d_col].isin(_NULLS))
            med = df[d_col].median()
            if pd.notna(med) and med > 5000:  # likely feet → convert to metres
                df[d_col] = df[d_col] * 0.3048
            df[d_col] = df[d_col].clip(upper=5500)

        # Filter by the requested depth window, but only if it leaves any rows —
        # an empty window silently falls back to the full dataset.
        if depth_range and 'DEPTH' in found_cols:
            d_col = found_cols['DEPTH']
            df_filtered = df[(df[d_col] >= depth_range[0]) & (df[d_col] <= depth_range[1])]
            if not df_filtered.empty:
                df = df_filtered

        # Title decorations; the trailing \" renders an inch mark after the
        # section number (sections are presumably hole sizes in inches — confirm).
        title_suffix = ""
        if section_filter: title_suffix += f" | Section: {section_filter}\""
        if depth_range: title_suffix += f" | Depth: {depth_range[0]}-{depth_range[1]}m"

        lines = [f"## WITSML Sensor Analysis — Well: `{matched_well}`{title_suffix}\n"]
        lines.append(f"**Source files:** {len(target_files)} | **Total rows:** {len(df):,}")

        lines.append(f"\n**Mapped columns:** {found_cols}\n")

        # ── Per-parameter summary statistics ──
        stats_rows = []
        for param, col in found_cols.items():
            s = pd.to_numeric(df[col], errors='coerce').dropna()

            # Robust filtering for ROP (Rate of Penetration)
            if param == 'ROP':
                # Only include data where ROP is physically plausible (0.5 to 500 m/hr).
                # This excludes noise and non-drilling time (zeros).
                s = s[(s > 0.5) & (s < 500)]

            if len(s) == 0:
                continue

            stats_rows.append({
                'Parameter': param,
                'Column': col,
                'Mean': round(s.mean(), 2),
                'Median': round(s.median(), 2),
                'Max': round(s.max(), 2),
                'Min': round(s.min(), 2),
                'StdDev': round(s.std(), 2),
                'N': len(s)
            })

        if stats_rows:
            stats_df = pd.DataFrame(stats_rows)
            lines.append("### Drilling Performance Statistics\n")
            lines.append(stats_df.to_markdown(index=False))

        # ── Depth summary ──
        if 'DEPTH' in found_cols:
            depth_col = found_cols['DEPTH']
            depth_s = pd.to_numeric(df[depth_col], errors='coerce').dropna()
            if len(depth_s) > 0:
                lines.append(f"\n**Total drilled depth range:** {depth_s.min():.0f} m — {depth_s.max():.0f} m MD")
                lines.append(f"**Net drilled footage:** {depth_s.max() - depth_s.min():.0f} m")

        # ── Optional: generate ROP-vs-depth scatter chart (headless Agg backend) ──
        if plot and 'ROP' in found_cols and 'DEPTH' in found_cols:
            try:
                rop_col = found_cols['ROP']
                dep_col = found_cols['DEPTH']
                plot_df = df[[dep_col, rop_col]].copy()
                plot_df[rop_col] = pd.to_numeric(plot_df[rop_col], errors='coerce')
                plot_df[dep_col] = pd.to_numeric(plot_df[dep_col], errors='coerce')
                plot_df = plot_df.dropna()
                plot_df = plot_df[plot_df[rop_col] > 0]  # Only while drilling

                fig, ax = plt.subplots(figsize=(8, 6))
                ax.scatter(plot_df[rop_col], plot_df[dep_col], alpha=0.3, s=5, color='steelblue')
                ax.invert_yaxis()  # depth increases downward, as on a well log
                ax.set_xlabel('ROP (m/hr)')
                ax.set_ylabel('Depth (m MD)')
                ax.set_title(f'ROP vs Depth — {matched_well}')
                ax.grid(True, alpha=0.3)
                plt.tight_layout()
                out_path = OUTPUTS_DIR / f"{well_slug}_rop_profile.png"
                plt.savefig(out_path, dpi=100)
                plt.close()
                lines.append(f"\n📊 **Chart saved:** `{out_path}`")
            except Exception as e:
                # Chart generation is best-effort; report the failure inline.
                lines.append(f"\n⚠️ Could not generate chart: {e}")

        return "\n".join(lines)
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
# ── Tool 4: Cross-Well Comparison ─────────────────────────────────────────────
|
| 500 |
+
|
| 501 |
+
class CrossWellCompareTool(BaseTool):
    """Compare DDR, WITSML and EDM-derived metrics for two or more wells.

    For each well named in the query this collects: total reported hours and
    max depth from the DDR activities CSV, NPT hours (keyword match on
    activity_code), average ROP from WITSML MD logs, and a short BHA summary
    from the EDM assembly-component table. Results are rendered as a markdown
    table plus a 3-panel bar chart saved under OUTPUTS_DIR.
    """

    name: str = "CrossWell_Comparison"
    description: str = (
        "Compare DDR activity statistics AND WITSML drilling performance between multiple wells side by side. "
        "Generates a comparison bar chart saved to outputs/figures/. "
        "Input: well names separated by ' vs ' or ' and ', e.g. 'Well A vs Well B vs Well C'. "
        "Accepts typos and different name formats."
    )

    def _run(self, query: str) -> str:
        # Parse multiple well names: normalise ' and ' / ',' separators to
        # ' vs ' first, then split on 'vs' (case-insensitive, optional dot).
        norm_query = re.sub(r'(\s+and\s+|,)', ' vs ', query, flags=re.IGNORECASE)
        parts = [p.strip() for p in re.split(r'\s+vs\.?\s+', norm_query, flags=re.IGNORECASE) if p.strip()]

        if len(parts) < 2:
            return "❌ Please provide at least two well names, e.g. '15/9-19 A vs 15/9-19 B vs 15/9-F-1 C'"

        results = []
        for wname in parts:
            slug = _normalize_well(wname)
            # Defaults keep the formatting stage total even when a data
            # source is missing for this well.
            wresult = {
                'user_name': wname,
                'slug': slug,
                'matched_name': wname,
                'total_hours': 0,
                'max_depth_m': 0,
                'npt_hours': 0,
                'avg_rop': 0,
                'bha_summary': 'N/A'
            }

            # ── DDR stats: total hours, max MD, NPT hours ──
            ddr_path = _fuzzy_find_well_file(wname, "_activities.csv")
            if ddr_path:
                try:
                    df = pd.read_csv(ddr_path)
                    df['duration_hours'] = pd.to_numeric(df.get('duration_hours', pd.Series()), errors='coerce')
                    df['md_m'] = pd.to_numeric(df.get('md_m', pd.Series()), errors='coerce')
                    wresult['total_hours'] = df['duration_hours'].sum()
                    wresult['max_depth_m'] = df['md_m'].max()
                    # Recover a display name from the matched file name.
                    wresult['matched_name'] = ddr_path.name.replace('_activities.csv', '').replace('_', '/')

                    # NPT: any activity_code containing one of these keywords.
                    npt_kw = ['npt', 'fishing', 'stuck', 'repair', 'wait', 'sidetrack', 'washout']
                    if 'activity_code' in df.columns:
                        npt_mask = df['activity_code'].str.lower().str.contains('|'.join(npt_kw), na=False)
                        wresult['npt_hours'] = df.loc[npt_mask, 'duration_hours'].sum()
                except Exception as e:
                    wresult['ddr_error'] = str(e)

            # ── WITSML: average ROP over first matching MD_Log files ──
            witsml_files = list(WITSML_DIR.glob(f"{slug}__*MD_Log*.csv"))
            if not witsml_files:
                witsml_files = list(WITSML_DIR.glob(f"{slug}__*.csv"))
            if witsml_files:
                try:
                    dfs = []
                    for f in witsml_files[:5]:  # limit files loaded
                        dfs.append(pd.read_csv(f, low_memory=False))
                    wdf = pd.concat(dfs, ignore_index=True)
                    # First ROP alias with positive data wins.
                    for rop_col in ['ROP', 'GS_ROP', 'ROP5', 'ROPIH']:
                        if rop_col in wdf.columns:
                            s = pd.to_numeric(wdf[rop_col], errors='coerce').dropna()
                            s = s[s > 0]
                            if len(s) > 0:
                                wresult['avg_rop'] = round(s.mean(), 2)
                                break
                except Exception as e:
                    wresult['witsml_error'] = str(e)

            # ── EDM: best-effort BHA (bit/motor) summary ──
            try:
                well_f = EDM_DIR / "edm_CD_WELL.csv"
                comp_f = EDM_DIR / "edm_CD_ASSEMBLY_COMP.csv"
                if well_f.exists() and comp_f.exists():
                    df_well = pd.read_csv(well_f)
                    df_comp = pd.read_csv(comp_f, low_memory=False)
                    # Slug whichever name column is present for fuzzy matching.
                    if 'well_common_name' in df_well.columns:
                        df_well['slug'] = df_well['well_common_name'].apply(lambda x: _normalize_well(str(x)))
                    else:
                        df_well['slug'] = df_well['well_legal_name'].apply(lambda x: _normalize_well(str(x)))

                    # Substring match in either direction handles base-well
                    # vs sidetrack naming differences.
                    match_mask = df_well['slug'].apply(
                        lambda x: isinstance(x, str) and (x in slug or slug in x)
                    )
                    if match_mask.any():
                        # Use shortest valid match (most generic base well).
                        matches = df_well[match_mask].copy()
                        matches['slug_len'] = matches['slug'].apply(len)
                        w_id = matches.sort_values('slug_len')['well_id'].iloc[0]
                        # All assembly components recorded for this well.
                        w_comps = df_comp[df_comp['well_id'] == w_id]
                        if not w_comps.empty:
                            bits_df = w_comps[w_comps['comp_type_code'].str.upper() == 'BIT']
                            motors_df = w_comps[w_comps['comp_type_code'].str.upper() == 'STM']

                            def _format_comp(cdf):
                                # One short label per component: name, else OD, else a placeholder.
                                items = []
                                for _, row in cdf.iterrows():
                                    name = str(row.get('comp_name', '')).strip()
                                    od = str(row.get('outer_diameter', '')).strip()
                                    if name and name.lower() != 'nan':
                                        items.append(name)
                                    elif od and od.lower() != 'nan':
                                        items.append(f"{od}\" OD")
                                    else:
                                        items.append("Present")
                                return list(set(items))

                            bits = _format_comp(bits_df)
                            motors = _format_comp(motors_df)

                            summary_parts = []
                            if len(bits) > 0:
                                summary_parts.append(f"Bits: {', '.join(bits[:2])}")
                            if len(motors) > 0:
                                summary_parts.append(f"Motors: {', '.join(motors[:2])}")

                            if summary_parts:
                                wresult['bha_summary'] = ' | '.join(summary_parts)
            except Exception:
                pass  # Non-fatal if BHA can't be found

            results.append(wresult)

        # ── Format text comparison ──
        lines = ["## ⚔️ Multi-Well Comparison\n"]
        metric_rows = []
        for wr in results:
            row = {
                'Well': wr['matched_name'],
                'Max Depth (m)': f"{wr.get('max_depth_m', 0):.0f}" if wr['max_depth_m'] > 0 else 'N/A',
                'Total Hours': f"{wr.get('total_hours', 0):.1f}",
                'NPT Hours': f"{wr.get('npt_hours', 0):.1f}",
                'Avg ROP (m/hr)': f"{wr.get('avg_rop', 0):.2f}" if wr['avg_rop'] > 0 else 'N/A',
                'BHA Focus': wr.get('bha_summary', 'N/A')
            }
            metric_rows.append(row)

        lines.append(pd.DataFrame(metric_rows).to_markdown(index=False))

        # ── Generate 3-panel bar chart (depth / ROP / NPT) ──
        try:
            labels = [wr['matched_name'] for wr in results]
            depths = [wr.get('max_depth_m', 0) for wr in results]
            avg_rops = [wr.get('avg_rop', 0) for wr in results]
            npt_hours = [wr.get('npt_hours', 0) for wr in results]

            fig, axes = plt.subplots(1, 3, figsize=(15, 6))
            fig.suptitle("Drilling Performance Comparison", fontsize=14, fontweight='bold')

            # One distinct colour per well.
            cmap = plt.cm.get_cmap('viridis', len(labels))
            colors = [cmap(i) for i in range(len(labels))]

            axes[0].bar(labels, depths, color=colors)
            axes[0].set_title('Max Depth (m)')
            axes[0].tick_params(axis='x', rotation=45)

            axes[1].bar(labels, avg_rops, color=colors)
            axes[1].set_title('Avg ROP (m/hr)')
            axes[1].tick_params(axis='x', rotation=45)

            axes[2].bar(labels, npt_hours, color=colors)
            axes[2].set_title('Total NPT Hours')
            axes[2].tick_params(axis='x', rotation=45)

            plt.tight_layout(rect=[0, 0.03, 1, 0.95])
            chart_path = OUTPUTS_DIR / "comparison.png"
            plt.savefig(chart_path, dpi=100, bbox_inches='tight')
            plt.close()
            lines.append(f"\n📊 **Comparison chart saved:** `{chart_path}`")
        except Exception as e:
            lines.append(f"\n⚠️ Could not generate chart: {e}")

        return "\n".join(lines)
|
| 681 |
+
|
| 682 |
+
|
| 683 |
+
# ── Tool 5: Python Interpreter ────────────────────────────────────────────────
|
| 684 |
+
|
| 685 |
+
class EDMTechnicalTool(BaseTool):
    """Query EDM CSV exports for a well's technical specification.

    Resolves the well (and, when possible, a specific wellbore) by fuzzy slug
    matching against edm_CD_WELL.csv, then reports BHAs, casing/liner strings
    and formation tops as markdown sections. Returns an error string rather
    than raising on any failure.
    """

    name: str = "EDM_Technical_Query"
    description: str = (
        "Queries technical data for a well: Formation Tops, Casing strings, and BHA (Assembly). "
        "Use this for 'complete' well comparisons or when asked about specific depths/geology."
    )

    def _run(self, well_name: str) -> str:
        slug = _normalize_well(well_name)

        well_f = EDM_DIR / "edm_CD_WELL.csv"

        output = [f"## Technical Specification: `{well_name}`"]

        try:
            if well_f.exists():
                df_well = pd.read_csv(well_f)
                # EDM well names are sometimes just 'F-1' or '15/9-19';
                # slug the common name for flexible matching.
                df_well['slug'] = df_well['well_common_name'].apply(lambda x: _normalize_well(str(x)))

                # If query is '15/9-19 A', slug is '15_9_19_A' but EDM might
                # just have '15/9-19', so accept prefix containment in either
                # direction. Sort by slug length descending so the most
                # specific well base is tried first.
                df_well['slug_len'] = df_well['slug'].str.len()
                df_well = df_well.sort_values('slug_len', ascending=False)

                row = pd.DataFrame()
                for _, r in df_well.iterrows():
                    if r['slug'] in slug or slug in r['slug']:
                        row = pd.DataFrame([r])
                        break

                if row.empty:
                    # Fallback: partial match on the first token of the name.
                    row = df_well[df_well['well_common_name'].str.contains(well_name.replace('_', '/').split()[0], na=False)]

                if not row.empty:
                    w_id = row.iloc[0]['well_id']

                    # ── Resolve a specific wellbore when possible ──
                    wb_id = None
                    wb_f = EDM_DIR / "edm_CD_WELLBORE.csv"
                    if wb_f.exists():
                        df_wb = pd.read_csv(wb_f)
                        df_wb_w = df_wb[df_wb['well_id'] == w_id]

                        # Prefer a legal-name match; otherwise take the first wellbore.
                        wb_exact = df_wb_w[df_wb_w['well_legal_name'].str.contains(well_name, na=False, case=False)]
                        if not wb_exact.empty:
                            wb_id = wb_exact.iloc[0]['wellbore_id']
                        elif not df_wb_w.empty:
                            wb_id = df_wb_w.iloc[0]['wellbore_id']

                    output.append(f"**Well ID:** {w_id} | **Wellbore ID:** {wb_id or 'N/A'} | **Water Depth:** {row.iloc[0].get('water_depth','N/A')} m")

                    # ── BHA (Assembly) data ──
                    assembly_f = EDM_DIR / "edm_CD_ASSEMBLY.csv"
                    assembly_comp_f = EDM_DIR / "edm_CD_ASSEMBLY_COMP.csv"

                    if assembly_f.exists() and assembly_comp_f.exists():
                        df_assy = pd.read_csv(assembly_f, low_memory=False)
                        df_comp = pd.read_csv(assembly_comp_f, low_memory=False)

                        w_assy = df_assy[df_assy['well_id'] == w_id]
                        if wb_id and 'wellbore_id' in df_assy.columns:
                            # Prioritize assemblies linked to the wellbore,
                            # but some are only linked to the well.
                            wb_assy = w_assy[w_assy['wellbore_id'] == wb_id]
                            if not wb_assy.empty:
                                w_assy = wb_assy
                        if not w_assy.empty:
                            output.append("\n### Bottom Hole Assemblies (BHA)")
                            bha_list = []

                            # Critical components for drilling optimization.
                            focus_comps = ['BIT', 'MWD', 'LWD', 'STM', 'IBS', 'NBS', 'DC', 'HW']

                            for _, assy in w_assy.iterrows():
                                a_id = assy['assembly_id']
                                a_name = assy.get('assembly_name', 'Unknown Assembly')
                                h_size = assy.get('hole_size', 'Unknown')

                                comps = df_comp[df_comp['assembly_id'] == a_id]
                                if not comps.empty:
                                    # Keep only the important drilling components.
                                    focus_mask = comps['comp_type_code'].isin(focus_comps)
                                    focus_c = comps[focus_mask].sort_values(by='sequence_no', ascending=False) if 'sequence_no' in comps.columns else comps[focus_mask]

                                    if not focus_c.empty:
                                        comp_summary = []
                                        for _, c in focus_c.iterrows():
                                            c_type = c['comp_type_code']
                                            c_desc = str(c.get('description', '')).split(',')[0]  # keep it short
                                            c_od = c.get('od_body', 'N/A')
                                            comp_summary.append(f"{c_type} ({c_od}\" OD): {c_desc}")

                                        bha_list.append({
                                            'Assembly Name': a_name,
                                            'Hole Size': h_size,
                                            'Key Components': ' | '.join(comp_summary)
                                        })

                            if bha_list:
                                output.append(pd.DataFrame(bha_list).to_markdown(index=False))

                    # ── Casing / liners ──
                    case_f = EDM_DIR / "edm_CD_CASE.csv"
                    if case_f.exists():
                        df_case = pd.read_csv(case_f)
                        w_case = df_case[df_case['well_id'] == w_id]
                        if wb_id and 'wellbore_id' in df_case.columns:
                            wb_case = w_case[w_case['wellbore_id'] == wb_id]
                            if not wb_case.empty:
                                w_case = wb_case

                        if not w_case.empty:
                            output.append("\n### Casing / Liners")
                            # Filter to actual strings; fall back to everything.
                            str_case = w_case[w_case['case_name'].str.contains("Casing|Liner", na=False, case=False)]
                            if str_case.empty:
                                str_case = w_case

                            cols_to_show = [c for c in ['case_name', 'phase', 'job_pipe_size'] if c in str_case.columns]
                            if cols_to_show:
                                output.append(str_case[cols_to_show].head(10).to_markdown(index=False))

                    # ── Formation tops ──
                    formation_f = EDM_DIR / "edm_CD_WELLBORE_FORMATION.csv"
                    if formation_f.exists():
                        df_form = pd.read_csv(formation_f)
                        w_form = df_form[df_form['well_id'] == w_id]
                        if wb_id and 'wellbore_id' in df_form.columns:
                            wb_form = w_form[w_form['wellbore_id'] == wb_id]
                            if not wb_form.empty:
                                w_form = wb_form

                        if not w_form.empty:
                            output.append("\n### Formation Tops")
                            # Sort by prognosed depth when the column exists.
                            sort_col = 'prognosed_md' if 'prognosed_md' in w_form.columns else w_form.columns[0]
                            w_form = w_form.sort_values(by=sort_col)

                            cols_to_show = [c for c in ['formation_name', 'prognosed_md', 'prognosed_tvd'] if c in w_form.columns]
                            if cols_to_show:
                                output.append(w_form[cols_to_show].head(10).to_markdown(index=False))

            if len(output) <= 1:
                return f"No EDM records found for {well_name}."

            return "\n".join(output)
        except Exception as e:
            return f"Error querying EDM: {e}"
|
| 843 |
+
|
| 844 |
+
class PythonTool(BaseTool):
    """Sandboxed(-ish) Python interpreter tool for custom agent analysis.

    Builds a script by prepending a data-access preamble (path constants,
    load_ddr / load_witsml / days_vs_depth / save_plotly_html helpers) to the
    agent-supplied code, writes it to /tmp and runs it in a subprocess with a
    30 s timeout, returning combined stdout/stderr.
    """

    name: str = "python_interpreter"
    description: str = (
        "Execute Python code (Pandas, Plotly, Numpy) for custom data analysis. "
        "Use for Days-vs-Depth charts, ROP correlations, NPT analysis, statistical filtering, or multi-signal plots.\n"
        "**MANDATORY RULES — violations cause FileNotFoundError or wrong charts:**\n"
        "1. NEVER construct file paths manually. NEVER use pd.read_csv('/data/...') or ANY hardcoded path.\n"
        "   The paths /data/ddr/, /data/processed/, etc. DO NOT EXIST. Use ONLY load_ddr() and load_witsml().\n"
        "2. Load DDR with: `df = load_ddr('15/9-F-12')` — columns: md_m (metres), activity_code, duration_hours, act_start, comments.\n"
        "3. days_vs_depth() is ONLY for explicit 'days vs depth' or 'drilling timeline' requests. "
        "NEVER call days_vs_depth() for NPT analysis, phase distribution, ROP charts, or any other chart type — "
        "use load_ddr() + load_witsml() directly for those. "
        "When called: `dvd = days_vs_depth('15/9-F-12')` — already cleaned, monotonic depth, correct time axis. "
        "Plot dvd['days_from_spud'] (x) vs dvd['max_depth_m'] (y, inverted). DO NOT use raw df['md_m'] for D-vs-D.\n"
        "4. Load WITSML with: `df, cols = load_witsml('15/9-F-12')` — depth is in metres (auto-converted). "
        "Always use `df[cols['ROP']]` not `df['ROP']`. Filter ROP > 0 to exclude non-drilling rows.\n"
        "5. Save charts with: `save_plotly_html(fig, 'chart_name')` — automatically saves HTML + PNG.\n"
        "6. Depth sanity: all depths are in metres MD, max ~3500m for F-12, ~5200m for deepest Volve well.\n"
        "Pre-injected: DDR_DIR, WITSML_DIR, EDM_DIR, px, go, load_ddr(), load_witsml(), days_vs_depth(), save_plotly_html().\n"
        "EDM tables: edm_CD_WELL.csv, edm_CD_HOLE_SECT.csv, edm_CD_ASSEMBLY_COMP.csv, edm_CD_WELLBORE_FORMATION.csv.\n"
        "Always print() results. Input: direct Python code string."
    )

    def _run(self, code: str) -> str:
        """Wrap `code` in the data-access preamble, execute it, return its output.

        Returns stdout (plus stderr appended when present), or an error string
        on write/execution failure. The agent code runs with the helpers and
        path constants below already defined at module level.
        """
        # Hard-code absolute paths at injection time — agent must NOT construct paths manually
        ddr_abs = str(BASE_DIR / "data" / "processed" / "ddr")
        witsml_abs = str(BASE_DIR / "data" / "processed" / "witsml")
        edm_abs = str(BASE_DIR / "data" / "processed" / "edm")
        outputs_abs = str(BASE_DIR / "outputs" / "figures")

        # NOTE(review): the preamble below defines days_vs_depth, _VOLVE_MAX_DEPTH_M
        # and _WITSML_NULLS twice; the later definitions shadow the earlier ones at
        # runtime, and the second _WITSML_NULLS set drops 99999.0 — confirm which
        # set is intended before deduplicating. (Cannot be annotated in place:
        # the preamble lives inside this f-string literal.)
        full_code = f"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import re as _re

# ── Pre-resolved absolute paths (DO NOT reconstruct these) ──
DDR_DIR = Path(r"{ddr_abs}")
WITSML_DIR = Path(r"{witsml_abs}")
EDM_DIR = Path(r"{edm_abs}")
OUTPUTS_DIR = Path(r"{outputs_abs}")
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# ── WITSML column aliases (DMEA first = most reliable measured depth) ──
_WITSML_COL_MAP = {{
    'ROP': ['ROP5', 'GS_ROP', 'ROPIH', 'ROPH', 'ROP'],
    'WOB': ['CWOB', 'WOB'],
    'RPM': ['RPM', 'GS_RPM', 'DRPM', 'TRPM_RT'],
    'TORQUE': ['TQA', 'GS_TQA'],
    'SPP': ['SPPA', 'GS_SPPA'],
    'HOOKLD': ['HKLD', 'GS_HKLD', 'HKLO', 'HKLI'],
    'DEPTH': ['DMEA', 'DEPTH', 'DEPT', 'TVDE'],  # DMEA first — DEPT can be a row index
}}

# Maximum realistic depth for any Volve well (deepest is ~5200m MD)
_VOLVE_MAX_DEPTH_M = 5500
# Sentinel / null values used in WITSML/LAS exports
_WITSML_NULLS = {{-999.25, -999.0, -9999.0, 9999.0, 9999.25, 99999.0}}

def _well_slug(name):
    s = _re.sub(r'[\\s/\\-]+', '_', name.strip().upper())
    s = _re.sub(r'^NO_', '', s)
    return _re.sub(r'_+', '_', s).strip('_')

def _clean_depth(series):
    \"\"\"Coerce to numeric, remove nulls/sentinels, convert ft→m if median > 5000.\"\"\"
    s = pd.to_numeric(series, errors='coerce')
    # Remove WITSML sentinel values
    s = s[~s.isin(_WITSML_NULLS) & s.notna() & (s >= 0)]
    if s.empty:
        return s
    # Auto-detect feet: if median depth exceeds 5000 it cannot be metres for Volve
    if s.median() > 5000:
        s = s / 3.28084  # convert ft → m
    # Cap at maximum realistic well depth
    s = s[s <= _VOLVE_MAX_DEPTH_M]
    return s

def load_ddr(well_name, drilling_only=False):
    \"\"\"
    Load DDR activities CSV for a well.
    Columns: act_start, md_m (meters MD), activity_code, state, duration_hours, comments.
    drilling_only=True keeps only depth-advancing activities (drill/slide) for D-vs-D charts.
    IMPORTANT: For Days vs Depth charts use cummax() on md_m — do NOT plot raw md_m directly
    (depth can repeat/jump during POOH). See days_vs_depth() helper below.
    \"\"\"
    slug = _well_slug(well_name)
    candidates = list(DDR_DIR.glob("*_activities.csv"))
    match = None
    for c in candidates:
        if c.stem.upper().startswith(slug):
            match = c; break
    if not match:
        for c in candidates:
            if slug in c.stem.upper():
                match = c; break
    if not match:
        toks = set(slug.split('_'))
        best, best_f = 0, None
        for c in candidates:
            s = len(toks & set(c.stem.upper().split('_')))
            if s > best: best, best_f = s, c
        if best >= 2: match = best_f
    if not match:
        print(f"ERROR: No DDR file found for '{{well_name}}' (slug: {{slug}})")
        print(f"Available: {{[c.name for c in candidates[:8]]}}")
        return pd.DataFrame()
    print(f"Loading DDR: {{match.name}}")
    df = pd.read_csv(match)
    if 'duration_hours' in df.columns:
        df['duration_hours'] = pd.to_numeric(df['duration_hours'], errors='coerce')
    if 'md_m' in df.columns:
        df['md_m'] = pd.to_numeric(df['md_m'], errors='coerce').clip(lower=0, upper=_VOLVE_MAX_DEPTH_M)
    if drilling_only and 'activity_code' in df.columns:
        mask = df['activity_code'].str.lower().str.contains('drill', na=False)
        df = df[mask]
    return df

def days_vs_depth(well_name):
    \"\"\"
    Build a clean Days-vs-Depth DataFrame for plotting.
    Returns df with columns: days_from_spud (float), max_depth_m (float), activity_code.
    Only includes the DRILLING CAMPAIGN (stops at max depth — no completion/workover extension).
    The max_depth_m column is monotonically non-decreasing (industry standard D-vs-D).
    \"\"\"
    df = load_ddr(well_name)
    if df.empty or 'act_start' not in df.columns:
        return pd.DataFrame()
    df = df[df['md_m'] > 0].copy()
    df['act_start'] = pd.to_datetime(df['act_start'], errors='coerce')
    df = df.dropna(subset=['act_start']).sort_values('act_start').reset_index(drop=True)
    t0 = df['act_start'].min()
    df['days_from_spud'] = (df['act_start'] - t0).dt.total_seconds() / 86400
    df['max_depth_m'] = df['md_m'].cummax()
    # Trim to drilling campaign: stop when depth stops increasing for >3 days
    td_idx = df['max_depth_m'].idxmax()
    post_td = df.loc[td_idx:, 'activity_code'].str.lower()
    # Find first completion/workover row after TD
    completion_mask = post_td.str.contains('complet|workover|abandon', na=False)
    if completion_mask.any():
        cut = completion_mask.idxmax()
        df = df.loc[:cut]
    else:
        df = df.loc[:td_idx + 5]  # allow a small buffer past TD
    print(f"Days-vs-Depth for {{well_name}}: {{len(df)}} points, "
          f"TD={{df['max_depth_m'].max():.0f}}m, total={{df['days_from_spud'].max():.1f}} days")
    return df[['days_from_spud', 'max_depth_m', 'activity_code', 'duration_hours']].copy()

def load_witsml(well_name):
    \"\"\"
    Load WITSML Depth/MD_Log files for a well.
    Returns (df, cols). Always access columns via df[cols['ROP']] — NEVER df['ROP'].
    Depth is in metres MD (auto-converts from feet if needed, removes sentinels).
    Available keys: 'ROP', 'WOB', 'RPM', 'TORQUE', 'SPP', 'HOOKLD', 'DEPTH'.
    \"\"\"
    slug = _well_slug(well_name)
    all_files = list(WITSML_DIR.glob("*.csv"))
    matching = [f for f in all_files if f.name.upper().startswith(slug + '__') and 'MD_LOG' in f.name.upper()]
    if not matching:
        matching = [f for f in all_files if slug in f.name.upper() and 'MD_LOG' in f.name.upper()]
    if not matching:
        matching = [f for f in all_files if slug in f.name.upper()]
    dfs = []
    for f in matching[:6]:
        try:
            _df = pd.read_csv(f, low_memory=False)
            # Per-file: clean any depth-like columns before concat to avoid unit mixing
            for dc in ['DMEA', 'DEPTH', 'DEPT', 'TVDE']:
                if dc in _df.columns:
                    cleaned = _clean_depth(_df[dc])
                    # If the cleaned series has < 20% valid rows, this column is not a depth
                    if len(cleaned) < 0.2 * len(_df):
                        _df.drop(columns=[dc], inplace=True, errors='ignore')
                    else:
                        _df[dc] = pd.to_numeric(_df[dc], errors='coerce')
                        # Replace sentinel/out-of-range with NaN
                        _df.loc[~_df[dc].isin(cleaned.index.map(lambda i: _df[dc].iloc[i] if i < len(_df) else None)), dc] = float('nan')
            dfs.append(_df)
        except Exception:
            pass
    if not dfs:
        print(f"WARNING: No WITSML files found for '{{well_name}}' (slug: {{slug}})")
        return pd.DataFrame(), {{}}
    df = pd.concat(dfs, ignore_index=True)
    # Resolve column map: pick first alias that has valid data in realistic range
    cols = {{}}
    for key, alts in _WITSML_COL_MAP.items():
        for alt in alts:
            if alt not in df.columns:
                continue
            v = pd.to_numeric(df[alt], errors='coerce')
            v = v[v.notna() & ~v.isin(_WITSML_NULLS) & (v >= 0)]
            if key == 'DEPTH':
                # Extra validation: must have median in realistic drilling depth range
                if v.empty or v.median() > _VOLVE_MAX_DEPTH_M:
                    continue
                # Convert feet if needed
                if v.median() > 5000:
                    df[alt] = df[alt].apply(lambda x: float(x)/3.28084 if pd.notna(x) else x)
            if len(v) > 10:
                cols[key] = alt; break
    # ── Physical-range guard: remove impossible values per parameter ──────────
    # Wide enough to accept both metric and imperial units; catches 10 000+ garbage.
    _PHYS = {{
        'ROP': (0.01, 300),  # m/hr or ft/hr — max practical ~200
        'WOB': (0, 500),  # klbs or kN — 500 klbs ≈ 2 225 kN
        'RPM': (0, 400),  # rpm
        'TORQUE': (0, 150000),  # Nm or ft-lbs — wide range
        'SPP': (0, 10000),  # PSI or bar — 10 000 PSI ≈ 690 bar
        'HOOKLD': (0, 10000),  # klbs or kN
    }}
    for _param, (_lo, _hi) in _PHYS.items():
        if _param in cols:
            _col = cols[_param]
            df[_col] = pd.to_numeric(df[_col], errors='coerce')
            # Replace sentinel nulls with NaN
            df.loc[df[_col].isin(_WITSML_NULLS), _col] = float('nan')
            # Null out physically impossible values (not clamp — keeps data honest)
            df.loc[~df[_col].between(_lo, _hi, inclusive='both') & df[_col].notna(), _col] = float('nan')
    print(f"WITSML for {{well_name}}: {{len(df)}} rows | params: {{list(cols.keys())}}")
    if 'DEPTH' in cols:
        dep = pd.to_numeric(df[cols['DEPTH']], errors='coerce').dropna()
        if not dep.empty:
            print(f"  Depth range: {{dep.min():.0f}}–{{dep.max():.0f}} m MD")
    for _p, _c in cols.items():
        if _p != 'DEPTH':
            _s = pd.to_numeric(df[_c], errors='coerce').dropna()
            if not _s.empty:
                print(f"  {{_p}} ({{_c}}): mean={{_s.mean():.1f}}, p5={{_s.quantile(0.05):.1f}}, p95={{_s.quantile(0.95):.1f}}")
    return df, cols

_VOLVE_MAX_DEPTH_M = 5500
_WITSML_NULLS = {{-999.25, -999.0, -9999.0, 9999.0, 9999.25}}

def _clean_depth_series(s):
    s = pd.to_numeric(s, errors='coerce')
    s = s[~s.isin(_WITSML_NULLS) & s.notna() & (s >= 0)]
    if s.empty: return s
    if s.median() > 5000: s = s / 3.28084
    return s[s <= _VOLVE_MAX_DEPTH_M]

def days_vs_depth(well_name):
    \"\"\"
    Return clean Days-vs-Depth DataFrame:
    days_from_spud (float), max_depth_m (monotonically increasing), activity_code.
    Automatically trims post-TD completion operations.
    ALWAYS use this helper for D-vs-D charts — never build from raw DDR.
    \"\"\"
    df = load_ddr(well_name)
    if df.empty or 'act_start' not in df.columns: return pd.DataFrame()
    df = df[df['md_m'] > 0].copy()
    df['act_start'] = pd.to_datetime(df['act_start'], errors='coerce')
    df = df.dropna(subset=['act_start']).sort_values('act_start').reset_index(drop=True)
    t0 = df['act_start'].min()
    df['days_from_spud'] = (df['act_start'] - t0).dt.total_seconds() / 86400
    df['max_depth_m'] = df['md_m'].cummax()
    td_idx = int(df['max_depth_m'].idxmax())
    # Cut off post-TD completion/workover
    post = df.loc[td_idx:, 'activity_code'].str.lower()
    comp_mask = post.str.contains('complet|workover|abandon', na=False)
    cut = int(comp_mask.idxmax()) if comp_mask.any() else td_idx + 10
    df = df.loc[:cut].copy()
    print(f"days_vs_depth({{well_name}}): {{len(df)}} pts | TD={{df['max_depth_m'].max():.0f}}m | {{df['days_from_spud'].max():.1f}} days")
    return df[['days_from_spud','max_depth_m','activity_code','duration_hours']].reset_index(drop=True)

def save_plotly_html(fig, filename_without_ext):
    \"\"\"Save interactive HTML + PNG snapshot for inline display.\"\"\"
    html_path = str(OUTPUTS_DIR / f"{{filename_without_ext}}.html")
    png_path = str(OUTPUTS_DIR / f"{{filename_without_ext}}.png")
    fig.write_html(html_path, include_plotlyjs='cdn')
    try:
        fig.write_image(png_path, width=1000, height=520, scale=1.5)
        print(f"Chart PNG saved to: {{png_path}}")
    except Exception as _e:
        print(f"PNG export skipped: {{_e}}")
    print(f"Interactive chart saved to: {{html_path}}")

{code}
"""
        # Save to temp file.
        # NOTE(review): fixed path is POSIX-only and shared between concurrent
        # runs — consider tempfile.NamedTemporaryFile; confirm single-worker use.
        tmp_script = "/tmp/analyst_script.py"
        with open(tmp_script, "w") as f:
            f.write(full_code)

        try:
            # 30 s wall-clock cap on the agent's script; stdout/stderr captured.
            result = subprocess.run(
                [sys.executable, tmp_script],
                capture_output=True, text=True, timeout=30
            )
            output = result.stdout
            if result.stderr:
                # NOTE(review): "\\n" emits a literal backslash-n before
                # "Error:" — presumably "\n" was intended; confirm.
                output += f"\\nError: {result.stderr}"
            return output if output.strip() else "Success (No output returned)."
        except Exception as e:
            return f"Execution Error: {e}"
|
src/agents/orchestrator.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
orchestrator.py
|
| 3 |
+
---------------
|
| 4 |
+
Hybrid Orchestrator for the Drilling Intelligence System (Phase 6).
|
| 5 |
+
Supports streaming "Thinking" logs and real-time responses.
|
| 6 |
+
"""
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import time
|
| 10 |
+
import logging
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Generator, Dict, Any
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
from google import genai
|
| 15 |
+
|
| 16 |
+
# Tools
|
| 17 |
+
from src.agents.tools import get_iadc_db, get_volve_db
|
| 18 |
+
# The deep reasoning loop
|
| 19 |
+
from src.agents.crew import run_aggregation_loop
|
| 20 |
+
|
| 21 |
+
load_dotenv()
|
| 22 |
+
log = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 25 |
+
MODEL_NAME = os.environ.get("GEMINI_MODEL", "gemini-3.1-flash-lite-preview")
|
| 26 |
+
_genai_client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
|
| 27 |
+
|
| 28 |
+
# ── Router Tags ──────────────────────────────────────────────────────────────
ROUTING_IADC = "IADC_Definition"
ROUTING_VOLVE_HISTORY = "Volve_History"
ROUTING_DEEP_ANALYST = "Data_Analysis"
ROUTING_AGGREGATE = "Extrapolation"
ROUTING_DUAL = "Dual_Search"  # New in Phase 6: Multi-source for ambiguous terms

# ── 1. Classification Engine ──────────────────────────────────────────────────

# Matches Volve-style well identifiers such as "15/9-f-1 c".
# Compiled once; [A-Za-z] (not [A-Z]) because the input is lowercased first.
_WELL_PATTERN = re.compile(r"(\d{1,2}/\d+-[A-Za-z]+-?\d+(?:\s*[A-Za-z])?)")


def classify_question(question: str) -> str:
    """Heuristic router with Phase 6 'Dual Search' and 'Geophysics' awareness.

    Routes a free-text question to one of the ROUTING_* tags, checking
    keyword groups in priority order:
      1. aggregation / lessons-learned
      2. ambiguous operational terms needing both theory and Volve context
      3. geology / formation tops
      4. numerical analytics
      5. Volve historical events (history keywords or a well-name pattern)
    Falls back to the IADC glossary route when nothing matches.
    """
    q_lower = question.lower()

    # 1. Macro / Lessons
    agg_kw = ["lessons learned", "extrapolate", "summarize", "overall", "compare across"]
    if any(kw in q_lower for kw in agg_kw):
        return ROUTING_AGGREGATE

    # 2. Tech terms that need Dual Search (Theory + Volve Context).
    # Whole-word match (padded spaces) so e.g. "snowplow" does not trigger "wow".
    dual_kw = ["wow", "waiting on weather", "npt", "stuck pipe", "milling", "kicks", "losses"]
    if any(kw == q_lower.strip() or f" {kw} " in f" {q_lower} " for kw in dual_kw):
        return ROUTING_DUAL

    # 3. Geophysics (Formation Tops)
    geo_kw = ["formation", "top", "stratigraphy", "geology", "lithology", "hugin", "shetland", "skagerrak"]
    if any(kw in q_lower for kw in geo_kw):
        return ROUTING_VOLVE_HISTORY

    # 4. Numerical / Analytics
    math_kw = ["average", "mean", "max", "min", "trend", "calc", "rop", "rpm", "chart", "table", "plot", "compare"]
    if any(kw in q_lower for kw in math_kw):
        return ROUTING_DEEP_ANALYST

    # 5. Volve Historical.
    # Fix: history_kw was previously declared but never consulted; "record"
    # (singular, substring) also covers the old hard-coded "records" check.
    history_kw = ["what happened", "record", "incident", "daily log", "instance"]
    if any(kw in q_lower for kw in history_kw) or _WELL_PATTERN.search(q_lower):
        return ROUTING_VOLVE_HISTORY

    return ROUTING_IADC
|
| 66 |
+
|
| 67 |
+
# ── 2. Unified RAG Execution ──────────────────────────────────────────────────
|
| 68 |
+
|
| 69 |
+
def run_fast_rag(question: str, routes: list, persona: str = "Technical Assistant") -> str:
    """Supports single OR multi-source RAG (Dual Search).

    For each route tag in *routes*, retrieves context chunks from the
    matching Chroma DB (IADC glossary or Volve history), builds a single
    prompt, and returns the Gemini completion text.

    Args:
        question: The user's natural-language question.
        routes: List of ROUTING_* tags; any non-IADC tag hits the Volve DB.
        persona: Accepted for API compatibility but currently unused —
            the tone is fixed by the system prompt below. TODO: wire up or remove.

    Returns:
        The model's answer text, a "couldn't find" message when retrieval
        is empty, or an "LLM Error: ..." string on API failure.
    """
    context_blocks = []

    for route in routes:
        if route == ROUTING_IADC:
            db = get_iadc_db()
            label = "IADC Drilling Glossary (Theory)"
            # k=4: definitions are short; a few chunks suffice.
            results = db.similarity_search(
                question, k=4,
                filter={"viking_namespace": "resources/iadc/"}
            )
            # Fallback: unfiltered search if namespace yields nothing
            if not results:
                results = db.similarity_search(question, k=4)
        else:
            db = get_volve_db()
            label = "Volve Field records (Operational History & Formation Picks)"
            # k=25: operational narratives are fragmented across many reports.
            results = db.similarity_search(
                question, k=25,
                filter={"viking_namespace": "resources/volve/"}
            )
            if not results:
                results = db.similarity_search(question, k=25)

        # Tag every chunk with its origin so the model can cite sources.
        for i, doc in enumerate(results):
            source = doc.metadata.get('source', 'Unknown source')
            # Reduce full paths to bare filenames for readability.
            if isinstance(source, str) and '/' in source:
                source = source.split('/')[-1]
            context_blocks.append(f"[{label} - Source: {source}]: {doc.page_content}")

    if not context_blocks:
        return "I couldn't find relevant technical or historical records for this query."

    context_str = "\n\n".join(context_blocks)

    # User Request: Technical Chat tone, weight Volve (65%).
    # Align with SPE Challenge grading requirements.
    system_prompt = f"""You are Odin, a strictly professional, highly technical, and analytical engineering AI system.
TONE: Maintain a serious, formal, and precise engineering tone. Provide logically structured, evidence-based answers.
DO NOT use casual language.

PRIORITY: When answering about operational concepts (like WOW or NPT),
give 65% more weight and detail to the Volve Field historical examples provided
over general definitions.

LANGUAGE: The Volve source documents may contain Norwegian text (from the Volve PUD and field reports).
If retrieved context contains Norwegian, translate it to English and present ONLY the English translation.
Never output Norwegian text to the user. Key translations: foringsrør=casing, borevæske=drilling fluid,
boreslam=drilling mud, brønn=well, hullseksjon=hole section, borekaks=drill cuttings.

EVIDENCE & ASSUMPTIONS: Always clearly state your evidence (e.g., "According to Volve DDR...") and declare any assumptions or confidence levels.

ONLY IF the user explicitly asks for a formal report, analysis, or structured breakdown, should you use rigorous sections like ## Evidence, ## Assumptions, etc. Otherwise, maintain a concise but highly professional technical summary.

CONTEXT:
{context_str}

QUESTION: {question}"""

    try:
        response = _genai_client.models.generate_content(
            model=MODEL_NAME,
            contents=system_prompt
        )
        return response.text
    except Exception as e:
        # Surface the API failure as text so callers never have to catch.
        return f"LLM Error: {e}"
|
| 137 |
+
|
| 138 |
+
# ── 3. Streaming Orchestrator ─────────────────────────────────────────────────
|
| 139 |
+
|
| 140 |
+
def run_pipeline(question: str, chat_history=None) -> Generator[Dict[str, Any], None, None]:
    """
    Generator that yields incremental status logs and the final answer.

    Event dicts yielded (consumed by the Gradio UI):
      - {"event": "log", "icon", "name", "status", "detail", "time"}
      - {"event": "verbose_log", "content"}
      - {"event": "final_answer", "answer", "route", "charts"} (always last)

    Note: the classifier route is computed for telemetry, but execution is
    currently delegated 100% to the CrewAI agent loop regardless of route.
    """
    t0 = time.time()

    # Small factory for uniformly-shaped log events.
    def log_evt(icon, name, status, detail=""):
        return {"event": "log", "icon": icon, "name": name, "status": status, "detail": detail, "time": time.time()}

    # 1. Memory Analysis
    if chat_history:
        yield log_evt("🧠", "Memory", f"Analyzing {len(chat_history)} messages...", "Restoring context.")

    # 2. Routing (telemetry only — see docstring)
    yield log_evt("🔍", "Classifier", "Analyzing intent...", f"'{question[:50]}...'")
    route = classify_question(question)
    yield log_evt("🔀", "Router", f"Path: Agentic Loop", "Delegating to Multi-Agent Crew.")

    # 3. Execution
    answer = ""
    charts = []

    # CrewAI Path (100% routing to allow dynamic tool discovery)
    yield log_evt("🤖", "Rig Crew", "Waking up Agents...", "Initializing reasoning loop.")
    try:
        # run_aggregation_loop is now a generator yielding log/answer events
        for event in run_aggregation_loop(question):
            if event["event"] == "log":
                # Re-wrap to stamp our own timestamp onto the event.
                yield log_evt(event["icon"], event["name"], event["status"], event["detail"])
            elif event["event"] == "final_answer":
                answer = event["answer"]
            elif event["event"] == "verbose_log":
                yield {"event": "verbose_log", "content": event.get("content", "")}
            elif event["event"] == "error":
                answer = f"CrewAI Error: {event['message']}"

        # Check for charts in outputs/figures
        fig_dir = BASE_DIR / "outputs" / "figures"
        if fig_dir.exists():
            for ext in ["*.png", "*.html"]:
                for p in fig_dir.glob(ext):
                    # Only append charts created in the last 2 minutes to avoid old charts
                    if time.time() - p.stat().st_mtime < 120:
                        if str(p.absolute()) not in charts:
                            charts.append(str(p.absolute()))
    except Exception as e:
        # Any agent failure is reported as the answer instead of crashing the UI.
        answer = f"Agent Error: {e}"

    elapsed = time.time() - t0
    yield log_evt("✅", "Complete", f"Done in {elapsed:.1f}s", "Finalizing response.")

    yield {"event": "final_answer", "answer": str(answer), "route": route, "charts": charts}
|
src/agents/promptfoo_provider.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import json
|
| 4 |
+
import google.generativeai as genai
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
# Load .env so GOOGLE_API_KEY is available before configuring the SDK.
load_dotenv()
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Single shared model instance reused for every promptfoo invocation.
model = genai.GenerativeModel("gemini-3.1-flash-lite-preview")
|
| 11 |
+
|
| 12 |
+
def main():
    """Promptfoo custom-provider entry point.

    Reads a JSON payload from stdin, extracts the prompt, runs it through
    the shared Gemini model, and prints a single JSON object to stdout:
    {"output": ...} on success (exit 0) or {"error": ...} on any failure
    (exit 1).
    """
    try:
        raw = sys.stdin.read()
        if not raw:
            print(json.dumps({"error": "No input provided on stdin"}))
            sys.exit(1)

        # Promptfoo passes the prompt string as JSON to stdin.
        payload = json.loads(raw)

        # Depending on how promptfoo calls this, the payload may be a
        # {"prompt": "..."} object, a bare JSON string, or something else
        # entirely — coerce the last case to str as a best effort.
        if isinstance(payload, dict) and 'prompt' in payload:
            prompt_text = payload['prompt']
        elif isinstance(payload, str):
            prompt_text = payload
        else:
            prompt_text = str(payload)

        reply = model.generate_content(prompt_text)

        # Promptfoo expects the response inside a JSON object: { "output": "..." }
        print(json.dumps({"output": reply.text}))
        sys.exit(0)

    except Exception as e:
        print(json.dumps({"error": str(e)}))
        sys.exit(1)

if __name__ == "__main__":
    main()
|
src/agents/tools.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tools.py
|
| 3 |
+
--------
|
| 4 |
+
Custom Tools for the SPE GCS 2026 ML Challenge Agents.
|
| 5 |
+
|
| 6 |
+
1. StatefulPythonExecutionTool: Safely executes generated Pandas code, keeping state.
|
| 7 |
+
2. IADC_SearchTool: Queries the local IADC ChromaDB for drilling concepts.
|
| 8 |
+
3. VolveHistory_SearchTool: Queries the Volve DDR ChromaDB for historical events.
|
| 9 |
+
"""
|
| 10 |
+
import os
|
| 11 |
+
import io
|
| 12 |
+
import sys
|
| 13 |
+
import pandas as pd
|
| 14 |
+
import numpy as np
|
| 15 |
+
import matplotlib.pyplot as plt
|
| 16 |
+
import plotly.express as px
|
| 17 |
+
import plotly.graph_objects as go
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from crewai.tools import BaseTool
|
| 20 |
+
from pydantic import Field
|
| 21 |
+
from langchain_chroma import Chroma
|
| 22 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 23 |
+
from dotenv import load_dotenv
|
| 24 |
+
|
| 25 |
+
load_dotenv()
|
| 26 |
+
|
| 27 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 28 |
+
DATA_DIR = BASE_DIR / "data" / "processed"
|
| 29 |
+
DDR_DIR = str(DATA_DIR / "ddr")
|
| 30 |
+
WITSML_DIR = str(DATA_DIR / "witsml")
|
| 31 |
+
OUTPUTS_DIR = BASE_DIR / "outputs" / "figures"
|
| 32 |
+
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 33 |
+
|
| 34 |
+
# ── 1. Stateful Python Execution Tool ─────────────────────────────────────────
|
| 35 |
+
|
| 36 |
+
class DataInventoryTool(BaseTool):
    """Report which processed datasets (DDR CSVs, WITSML folders, PDFs) exist on disk."""

    name: str = "data_inventory_inspector"
    description: str = "Use this tool to see what data (CSVs, WITSML, Files) are available across DDR_DIR and WITSML_DIR. Returns a summary of wells and datasets."

    def _run(self, query: str = "") -> str:
        """Build a markdown summary of available data; *query* is ignored."""
        summary = ["### Project Data Inventory"]

        # DDR Directory (env var override allows tests/deploys to relocate data)
        ddr_path = os.environ.get('DDR_DIR', DDR_DIR)
        if os.path.exists(ddr_path):
            files = os.listdir(ddr_path)
            summary.append(f"\n**DDR Directory ({ddr_path}):**")
            summary.append(f"- Total Files: {len(files)}")
            csvs = [f for f in files if f.endswith('.csv')]
            # Derive well names by stripping the per-file suffixes.
            wells = {f.split('_activities')[0].split('_daily')[0] for f in csvs if '_' in f}
            summary.append(f"- Detected Wells: {', '.join(sorted(wells)[:10])}...")
            if '_ddr_extraction_summary.csv' in files:
                summary.append("- [Key File]: `_ddr_extraction_summary.csv` (High-level well metadata)")
            if '_ddr_all_activities.csv' in files:
                summary.append("- [Key File]: `_ddr_all_activities.csv` (Granular time-log across all wells)")

        # WITSML Directory
        witsml_path = os.environ.get('WITSML_DIR', WITSML_DIR)
        if os.path.exists(witsml_path):
            wells_witsml = [d for d in os.listdir(witsml_path) if os.path.isdir(os.path.join(witsml_path, d))]
            summary.append(f"\n**WITSML Directory ({witsml_path}):**")
            summary.append(f"- Well Folders: {', '.join(wells_witsml)}")

        # PDF Reports.
        # Fix: anchor at BASE_DIR like the other paths — the old cwd-relative
        # "data/raw/Reports" silently found nothing when run from another dir.
        pdf_path = str(BASE_DIR / "data" / "raw" / "Reports")
        if os.path.exists(pdf_path):
            pdfs = [f for f in os.listdir(pdf_path) if f.endswith('.pdf')]
            summary.append("\n**PDF Knowledge Source:**")
            summary.append(f"- Reports: {', '.join(pdfs)}")

        return "\n".join(summary)
|
| 72 |
+
|
| 73 |
+
def save_plotly_html(fig, filename_without_ext):
    """Agent-REPL helper: persist a Plotly figure as HTML under OUTPUTS_DIR."""
    target = os.path.join(str(OUTPUTS_DIR), f"{filename_without_ext}.html")
    fig.write_html(target)
    print(f"Interactive Plotly chart saved to: {target}")
|
| 79 |
+
# Global REPL state so variables persist between tool calls in the same run.
# This dict is passed as the globals mapping to exec() in
# StatefulPythonExecutionTool, pre-seeding the agent's namespace with the
# analysis libraries, data directory paths, and helper utilities below.
_repl_globals = {
    "pd": pd,      # pandas
    "plt": plt,    # matplotlib.pyplot (Plotly preferred; kept for fallback)
    "np": np,      # numpy
    "px": px,      # plotly.express
    "go": go,      # plotly.graph_objects
    "os": os,
    "DDR_DIR": DDR_DIR,          # processed daily-drilling-report CSVs
    "WITSML_DIR": WITSML_DIR,    # processed WITSML log folders
    "OUTPUTS_DIR": str(OUTPUTS_DIR),
    "DataInventory": DataInventoryTool(),   # lets generated code inspect data
    "save_plotly_html": save_plotly_html    # chart-saving helper
}
|
| 93 |
+
|
| 94 |
+
class StatefulPythonExecutionTool(BaseTool):
    """Execute LLM-generated Python in a persistent namespace, capturing stdout."""

    name: str = "Python REPL Data Analyst"
    description: str = (
        "Execute Python code (especially Pandas, and Plotly) to analyze data. "
        "Variables defined here PERSIST between calls. "
        "You have access to Plotly via `px` (plotly.express) and `go` (plotly.graph_objects). "
        "IMPORTANT FOR VISUALIZATIONS: Use Plotly instead of Matplotlib whenever possible. "
        "After creating a Plotly figure `fig`, save it using the provided helper: `save_plotly_html(fig, 'my_chart_name')`. "
        "Always use `print()` or `print(df.to_markdown())` to output the results so you can read them. "
        "Truncate massive outputs; do not print DataFrames with >50 rows."
    )

    @staticmethod
    def _strip_code_fences(code: str) -> str:
        """Remove the surrounding markdown ``` fences LLMs often emit."""
        code = code.strip()
        if code.startswith("```python"):
            code = code[9:]
        elif code.startswith("```"):
            code = code[3:]
        if code.endswith("```"):
            code = code[:-3]
        return code.strip()

    def _run(self, code: str) -> str:
        """Run *code* inside the shared `_repl_globals` namespace.

        Returns captured stdout, an "Error executing code:" message on
        exception, a hint when nothing was printed, or a truncated result
        when output exceeds 8000 characters.
        """
        code = self._strip_code_fences(code)

        # Capture everything the agent prints. NOTE: swapping sys.stdout is
        # process-global, so this tool is not safe for concurrent calls.
        old_stdout = sys.stdout
        redirected_output = sys.stdout = io.StringIO()

        try:
            # SECURITY: exec() of LLM-generated code is inherently unsafe —
            # acceptable here only because the input is our own agent's output,
            # never untrusted user text.
            # (Was: exec(code, getattr(sys.modules[__name__], '_repl_globals'))
            # — a needless indirection to the same module-level dict.)
            exec(code, _repl_globals)
        except Exception as e:
            return f"Error executing code:\n{e}"
        finally:
            # Restores stdout on both success and failure paths.
            sys.stdout = old_stdout

        output = redirected_output.getvalue()

        # Hard limits on output size to protect the LLM context window
        if not output.strip():
            return "Code executed successfully, but nothing was printed. Please `print()` the result to see it."

        if len(output) > 8000:
            return output[:8000] + "\n\n... [OUTPUT TRUNCATED: Result exceeded 8000 characters. Please refine your code to print smaller summaries.]"

        return output
|
| 138 |
+
|
| 139 |
+
# ── 2. Vector Search Tools ─────────────────────────────────────────────────────
|
| 140 |
+
|
| 141 |
+
# Lazy singletons for the two vector databases
|
| 142 |
+
_iadc_db = None
|
| 143 |
+
_volve_db = None
|
| 144 |
+
_embeddings = None
|
| 145 |
+
EMBEDDING_MODEL = "models/gemini-embedding-2-preview"
|
| 146 |
+
|
| 147 |
+
def get_embeddings():
    """Return the lazily-created, process-wide Gemini embeddings client."""
    global _embeddings
    if _embeddings is not None:
        return _embeddings
    _embeddings = GoogleGenerativeAIEmbeddings(
        model=EMBEDDING_MODEL,
        google_api_key=os.environ.get("GOOGLE_API_KEY"),
    )
    return _embeddings
|
| 156 |
+
|
| 157 |
+
def get_iadc_db():
    """Return the lazily-opened Chroma store used for IADC glossary lookups."""
    global _iadc_db
    if _iadc_db is not None:
        return _iadc_db
    persist_path = BASE_DIR / "data" / "viking_context" / "chroma_fallback"
    _iadc_db = Chroma(
        persist_directory=str(persist_path),
        embedding_function=get_embeddings(),
    )
    return _iadc_db
|
| 163 |
+
|
| 164 |
+
def get_volve_db():
    """Return the lazily-opened Chroma store used for Volve history lookups.

    Currently points at the same persisted collection as get_iadc_db();
    queries are separated by the `viking_namespace` metadata filter.
    """
    global _volve_db
    if _volve_db is not None:
        return _volve_db
    persist_path = BASE_DIR / "data" / "viking_context" / "chroma_fallback"
    _volve_db = Chroma(
        persist_directory=str(persist_path),
        embedding_function=get_embeddings(),
    )
    return _volve_db
|
| 170 |
+
|
| 171 |
+
class IADC_SearchTool(BaseTool):
    """Semantic lookup of drilling theory/definitions in the IADC glossary DB."""

    name: str = "Drilling Knowledge (IADC) DB Search"
    description: str = (
        "Search the IADC drilling glossary and general Wikipedia technical articles. "
        "Use this for DEFINITIONS and THEORY (e.g. 'What is a BHA?', 'What causes stuck pipe?'). "
        "Do NOT use this for specific Volve well events."
    )

    def _run(self, query: str) -> str:
        """Return up to 3 matching glossary chunks tagged with their source file."""
        try:
            hits = get_iadc_db().similarity_search(
                query,
                k=3,
                # Restrict to the OpenViking IADC namespace.
                filter={"viking_namespace": "resources/iadc/"},
            )
            if not hits:
                return "No relevant IADC information found in OpenViking context."

            formatted = []
            for doc in hits:
                src = doc.metadata.get('source', 'Unknown')
                # Shorten full paths to the bare filename.
                if isinstance(src, str) and '/' in src:
                    src = src.rsplit('/', 1)[-1]
                formatted.append(f"[Source: {src}]: {doc.page_content}")
            return "\n\n".join(formatted)
        except Exception as e:
            return f"Error searching IADC DB: {e}"
|
| 197 |
+
|
| 198 |
+
class VolveHistory_SearchTool(BaseTool):
    """Hybrid (semantic + keyword-fallback) search over Volve daily drilling reports."""

    name: str = "Volve Campaign History DB Search"
    description: str = (
        "Search the historical Daily Drilling Reports (DDR) from the Volve campaign. "
        "Use this for HISTORICAL EVENTS and EQUIPMENTS (e.g. 'What BHA components failed on well 15/9-F-1 C?', 'Find instances of stuck pipe', 'Motor performance'). "
        "Do NOT use this for general definitions."
    )

    def _run(self, query: str) -> str:
        """Search Volve DDR history for *query*.

        Strategy: (1) semantic search in Chroma; (2) if none of the semantic
        hits literally contains a high-value drilling keyword from the query,
        additionally scan the raw narratives CSV for literal matches; merge
        both, dedup by content, and truncate to 12 000 characters.
        """
        try:
            # 1. Semantic Search (OpenViking L2 Overview via Gemini 2)
            db = get_volve_db()
            viking_filter = {"viking_namespace": "resources/volve/"}
            results = db.similarity_search(query, k=10, filter=viking_filter)

            output = []
            seen_content = set()  # dedup guard across both result channels

            # Identify high-value keywords for fallback (OpenViking L0 Hybrid Glob logic)
            keywords = ["whipstock", "milling", "stuck", "fishing", "loss", "kick", "cement", "casing", "liner", "window", "weather", "heave", "bha", "assembly", "motor", "mwd", "lwd", "bit", "failure", "twist off"]
            query_keywords = [k for k in keywords if k in query.lower()]

            # 2. Keyword Fallback: If no results or if specific keywords were missed
            found_keywords = False
            for doc in results:
                for k in query_keywords:
                    if k.upper() in doc.page_content.upper():
                        found_keywords = True
                        # NOTE(review): this break exits only the inner loop;
                        # the outer scan continues — harmless, just extra work.
                        break

            # If we didn't find specific matches, try a literal scan of the narratives CSV
            if not found_keywords and query_keywords:
                csv_path = BASE_DIR / "data" / "processed" / "serialized_text" / "ddr_narratives.csv"
                if csv_path.exists():
                    # NOTE(review): pandas is already imported at module level;
                    # this local import is redundant but kept for safety.
                    import pandas as pd
                    df = pd.read_csv(csv_path)
                    # Simple keyword filter (regex OR over all matched keywords)
                    mask = df['text'].str.lower().str.contains('|'.join(query_keywords), na=False)
                    kw_results = df[mask].tail(10) # Get latest 10 matches
                    if not kw_results.empty:
                        for idx, row in kw_results.iterrows():
                            content = row['text']
                            if content not in seen_content:
                                output.append(f"[Volve-KeywordMatch]:\n{content}")
                                seen_content.add(content)

            # Add semantic results (avoiding duplicates); keyword matches,
            # when present, deliberately come first in the output.
            for i, doc in enumerate(results):
                if doc.page_content not in seen_content:
                    source = doc.metadata.get('source', 'Unknown source')
                    # Shorten full paths to the bare filename.
                    if isinstance(source, str) and '/' in source:
                        source = source.split('/')[-1]
                    output.append(f"[Source: {source}]:\n{doc.page_content}")
                    seen_content.add(doc.page_content)

            if not output:
                return "No historical Volve events found matching this query."

            # Cap the payload to protect the calling agent's context window.
            result_str = "\n\n---\n\n".join(output)
            if len(result_str) > 12000:
                return result_str[:12000] + "\n...[TRUNCATED]"
            return result_str

        except Exception as e:
            return f"Error searching Volve History DB: {e}"
|
| 263 |
+
|
src/data_pipeline/__init__.py
ADDED
|
File without changes
|
src/data_pipeline/parse_ddr_xml.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
parse_ddr_xml.py
|
| 3 |
+
----------------
|
| 4 |
+
Parses Daily Drilling Report (DDR) XML files (WITSML 1.4 drillReport schema)
|
| 5 |
+
from data/raw/Well_technical_data/Daily Drilling Report - XML Version/
|
| 6 |
+
into structured CSV files in data/processed/ddr/
|
| 7 |
+
|
| 8 |
+
Produces two outputs per well:
|
| 9 |
+
1. <well>_activities.csv — timestamped activity log with depth, phase, code, comments
|
| 10 |
+
2. <well>_daily_summary.csv — one row per daily report with high-level metadata
|
| 11 |
+
|
| 12 |
+
Also produces:
|
| 13 |
+
- _ddr_all_activities.csv — consolidated across all wells (useful for agent queries)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import xml.etree.ElementTree as ET
|
| 19 |
+
import pandas as pd
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
import logging
|
| 22 |
+
from collections import defaultdict
|
| 23 |
+
|
| 24 |
+
from utils import normalize_well_name, safe_filename
|
| 25 |
+
|
| 26 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 27 |
+
log = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
# ── Paths ─────────────────────────────────────────────────────────────────────
|
| 30 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 31 |
+
DDR_DIR = BASE_DIR / "data" / "raw" / "Well_technical_data" / "Daily Drilling Report - XML Version"
|
| 32 |
+
OUT_DIR = BASE_DIR / "data" / "processed" / "ddr"
|
| 33 |
+
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
| 34 |
+
|
| 35 |
+
WITSML_NS = {
|
| 36 |
+
"witsml": "http://www.witsml.org/schemas/1series"
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _strip_ns(tag: str) -> str:
|
| 41 |
+
return tag.split("}")[-1] if "}" in tag else tag
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def find_text(elem: ET.Element, tag: str, ns: str = "witsml") -> str | None:
    """Find text of first matching child (namespace-aware and ns-stripped).

    Args:
        elem: Parent element whose direct children are searched.
        tag: Local tag name, without any namespace prefix.
        ns: Prefix key into WITSML_NS. Fix: this parameter was previously
            accepted but ignored ("witsml" was hardcoded); it is now honored,
            with the same default, so existing callers behave identically.

    Returns:
        The stripped text of the first matching child, or None if no child
        matches or the matching child has no text.
    """
    # Try namespace-qualified lookup first (fast path for schema-valid files).
    child = elem.find(f"{ns}:{tag}", WITSML_NS)
    if child is not None:
        return child.text.strip() if child.text else None
    # Fall back to a prefix-agnostic scan over direct children.
    for c in elem:
        if _strip_ns(c.tag) == tag:
            return c.text.strip() if c.text else None
    return None
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def parse_ddr_xml(xml_path: Path) -> dict:
    """
    Parse a single DDR (Daily Drilling Report) XML file.

    Walks every <drillReport> element in the document (root level or
    nested) and extracts per-report header metadata plus each <activity>
    entry found beneath it.

    Args:
        xml_path: path to one DDR XML file.

    Returns dict with keys:
      - 'daily': list of per-report metadata dicts, or None when the
        file is malformed or contains no drillReport elements
      - 'activities': list of activity dicts (empty in the failure case)
    """
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
    except ET.ParseError as e:
        # Malformed file: log and skip rather than aborting the whole batch.
        log.warning(f"Parse error {xml_path.name}: {e}")
        return {"daily": None, "activities": []}

    # drillReport elements can be at root level or nested
    reports = list(root.iter())
    dr_elems = [e for e in reports if _strip_ns(e.tag) == "drillReport"]

    if not dr_elems:
        return {"daily": None, "activities": []}

    all_daily = []
    all_activities = []

    for dr in dr_elems:
        # ── Daily header ─────────────────────────────────────────────────────
        well_name = find_text(dr, "nameWell")
        wellbore_name = find_text(dr, "nameWellbore")
        dtim_start = find_text(dr, "dTimStart")
        dtim_end = find_text(dr, "dTimEnd")
        create_date = find_text(dr, "createDate")

        # wellboreInfo block: first matching direct child, if any.
        wb_info = None
        for c in dr:
            if _strip_ns(c.tag) == "wellboreInfo":
                wb_info = c
                break

        spud_date = find_text(wb_info, "dTimSpud") if wb_info is not None else None
        drill_complete = find_text(wb_info, "dateDrillComplete") if wb_info is not None else None
        operator = find_text(wb_info, "operator") if wb_info is not None else None
        drill_contractor= find_text(wb_info, "drillContractor") if wb_info is not None else None

        daily_row = {
            "file": xml_path.name,
            "well_name": well_name,
            "wellbore_name": wellbore_name,
            "report_start": dtim_start,
            "report_end": dtim_end,
            "create_date": create_date,
            "spud_date": spud_date,
            "drill_complete": drill_complete,
            "operator": operator,
            "drill_contractor": drill_contractor,
        }
        all_daily.append(daily_row)

        # ── Activities ───────────────────────────────────────────────────────
        for elem in dr.iter():
            if _strip_ns(elem.tag) == "activity":
                act_start = find_text(elem, "dTimStart")
                act_end = find_text(elem, "dTimEnd")
                phase = find_text(elem, "phase")
                prop_code = find_text(elem, "proprietaryCode")
                state = find_text(elem, "state")
                state_detail = find_text(elem, "stateDetailActivity")
                comments = find_text(elem, "comments")

                # Measured depth: value plus unit attribute.
                # NOTE: no break — if several <md> children exist, the last one wins.
                md_val = None
                md_uom = None
                for c in elem:
                    if _strip_ns(c.tag) == "md":
                        md_val = c.text.strip() if c.text else None
                        md_uom = c.attrib.get("uom", None)

                # Duration in hours if both timestamps available
                all_activities.append({
                    "file": xml_path.name,
                    "well_name": well_name,
                    "wellbore_name": wellbore_name,
                    "report_start": dtim_start,
                    "report_end": dtim_end,
                    "act_start": act_start,
                    "act_end": act_end,
                    "md_m": md_val,
                    "md_uom": md_uom,
                    "phase": phase,
                    "activity_code": prop_code,
                    "state": state,
                    "state_detail": state_detail,
                    "comments": comments,
                })

    return {"daily": all_daily, "activities": all_activities}
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def extract_well_key(well_name: str | None) -> str:
    """Canonicalize a well name for consistent referencing, e.g. 'NO 15/9-F-12' -> '15/9-F-12'."""
    if not well_name:
        well_name = "UNKNOWN"
    return normalize_well_name(well_name)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def parse_all_ddrs():
    """Parse every DDR XML file under DDR_DIR and write per-well CSVs.

    Outputs to OUT_DIR:
      - <well>_daily_summary.csv  — one row per daily report
      - <well>_activities.csv     — timestamped activity log with duration_hours
      - _ddr_all_activities.csv   — activities consolidated across all wells
      - _ddr_extraction_summary.csv — per-well report/activity counts
    """
    # Skip Windows "Zone.Identifier" alternate-stream artifacts.
    xml_files = sorted([f for f in DDR_DIR.glob("*.xml")
                        if not f.name.endswith("Zone.Identifier")])

    log.info(f"Found {len(xml_files)} DDR XML files in {DDR_DIR}")

    # Group rows by canonical well key so one CSV pair is written per well.
    all_daily_by_well: dict[str, list] = defaultdict(list)
    all_acts_by_well: dict[str, list] = defaultdict(list)

    for xml_path in xml_files:
        result = parse_ddr_xml(xml_path)
        if result["daily"]:
            for row in result["daily"]:
                key = extract_well_key(row.get("well_name"))
                all_daily_by_well[key].append(row)
        for act in result["activities"]:
            key = extract_well_key(act.get("well_name"))
            all_acts_by_well[key].append(act)

    all_wells = sorted(set(list(all_daily_by_well.keys()) + list(all_acts_by_well.keys())))

    summary_rows = []
    all_acts_global = []

    for well_key in all_wells:
        # ── Daily summary CSV ────────────────────────────────────────────────
        daily_rows = all_daily_by_well.get(well_key, [])
        if daily_rows:
            df_daily = pd.DataFrame(daily_rows).drop_duplicates()
            # Coerce report_start to UTC timestamps; unparseable values become NaT.
            df_daily["report_start"] = pd.to_datetime(df_daily["report_start"], errors="coerce", utc=True)
            df_daily = df_daily.sort_values("report_start")
            safe_key = safe_filename(well_key)
            out_daily = OUT_DIR / f"{safe_key}_daily_summary.csv"
            df_daily.to_csv(out_daily, index=False)
            log.info(f" [{well_key}] {len(df_daily)} daily reports → {out_daily.name}")

        # ── Activities CSV ───────────────────────────────────────────────────
        act_rows = all_acts_by_well.get(well_key, [])
        if act_rows:
            df_acts = pd.DataFrame(act_rows)
            df_acts["act_start"] = pd.to_datetime(df_acts["act_start"], errors="coerce", utc=True)
            df_acts["act_end"] = pd.to_datetime(df_acts["act_end"], errors="coerce", utc=True)
            df_acts["md_m"] = pd.to_numeric(df_acts["md_m"], errors="coerce")
            df_acts = df_acts.sort_values("act_start")

            # Compute duration_hours only where both timestamps parsed.
            mask = df_acts["act_start"].notna() & df_acts["act_end"].notna()
            df_acts.loc[mask, "duration_hours"] = (
                (df_acts.loc[mask, "act_end"] - df_acts.loc[mask, "act_start"])
                .dt.total_seconds() / 3600
            )

            safe_key = safe_filename(well_key)
            out_acts = OUT_DIR / f"{safe_key}_activities.csv"
            df_acts.to_csv(out_acts, index=False)
            log.info(f" [{well_key}] {len(df_acts)} activities → {out_acts.name}")
            all_acts_global.append(df_acts)

        summary_rows.append({
            "well_key": well_key,
            "n_daily_reports": len(daily_rows),
            "n_activities": len(act_rows),
        })

    # ── Global consolidated activities file ───────────────────────────────────
    if all_acts_global:
        df_all = pd.concat(all_acts_global, ignore_index=True)
        df_all = df_all.sort_values(["well_name", "act_start"])
        df_all.to_csv(OUT_DIR / "_ddr_all_activities.csv", index=False)
        log.info(f"\nGlobal activities file: {len(df_all)} rows across {len(all_wells)} wells")

    # ── Extraction summary ────────────────────────────────────────────────────
    if summary_rows:
        df_summary = pd.DataFrame(summary_rows)
        df_summary.to_csv(OUT_DIR / "_ddr_extraction_summary.csv", index=False)
        print("\n" + df_summary.to_string(index=False))
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
if __name__ == "__main__":
|
| 239 |
+
parse_all_ddrs()
|
src/data_pipeline/parse_edm.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
parse_edm.py
|
| 3 |
+
------------
|
| 4 |
+
Parses the Volve F.edm.xml (Landmark Engineering Data Model) into
|
| 5 |
+
structured CSVs extracting well/wellbore metadata, casing configurations,
|
| 6 |
+
BHA (Bottom Hole Assembly) details, and daily cost records.
|
| 7 |
+
|
| 8 |
+
Outputs to data/processed/edm/
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import xml.etree.ElementTree as ET
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import logging
|
| 15 |
+
|
| 16 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 17 |
+
log = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 20 |
+
EDM_FILE = BASE_DIR / "data" / "raw" / "Well_technical_data" / "EDM.XML" / "Volve F.edm.xml"
|
| 21 |
+
OUT_DIR = BASE_DIR / "data" / "processed" / "edm"
|
| 22 |
+
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _strip_ns(tag: str) -> str:
|
| 26 |
+
return tag.split("}")[-1] if "}" in tag else tag
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def elem_to_dict(elem: ET.Element, prefix: str = "") -> dict:
    """
    Flatten an XML element into a single flat dict.

    Attribute names are stored under '<prefix><attr>', stripped element
    text under '<prefix>value', and each child subtree is flattened with
    the child's own tag plus '_' as prefix. Note: a child's prefix does
    NOT include the outer prefix, so deeply nested keys carry only their
    immediate parent tag — this matches the original flattening scheme.
    """
    def local(name: str) -> str:
        # Drop a '{uri}' namespace qualifier if one is present.
        return name.rpartition("}")[2]

    flat: dict = {}
    flat.update({f"{prefix}{local(k)}": v for k, v in elem.attrib.items()})
    text = (elem.text or "").strip()
    if text:
        flat[f"{prefix}value"] = text
    for child in elem:
        flat.update(elem_to_dict(child, prefix=f"{local(child.tag)}_"))
    return flat
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def collect_elements(root: ET.Element, element_type: str) -> list[dict]:
    """Collect every element whose local tag matches *element_type* (case-insensitive), flattened to dicts."""
    wanted = element_type.lower()
    return [
        elem_to_dict(node)
        for node in root.iter()
        if _strip_ns(node.tag).lower() == wanted
    ]
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def parse_edm():
    """Parse the Landmark EDM export (Volve F.edm.xml) into CSVs under OUT_DIR.

    First writes an inventory of all element types, then one CSV per
    entity type listed in ENTITIES (wells, wellbores, BHA assemblies,
    hole sections, surveys, etc.). Logs and returns early if the file
    is missing or malformed.
    """
    if not EDM_FILE.exists():
        log.error(f"EDM file not found: {EDM_FILE}")
        return

    log.info(f"Parsing EDM file: {EDM_FILE}")
    try:
        tree = ET.parse(EDM_FILE)
        root = tree.getroot()
    except ET.ParseError as e:
        log.error(f"XML parse error: {e}")
        return

    # Survey the top-level structure first: count every element type once.
    tag_counts: dict[str, int] = {}
    for elem in root.iter():
        tag = _strip_ns(elem.tag)
        tag_counts[tag] = tag_counts.get(tag, 0) + 1

    # Log the 30 most frequent element types for orientation.
    log.info("Top element types in EDM.XML:")
    for tag, count in sorted(tag_counts.items(), key=lambda x: -x[1])[:30]:
        log.info(f" {tag}: {count}")

    # Save element inventory
    inv_df = pd.DataFrame(
        sorted(tag_counts.items(), key=lambda x: -x[1]),
        columns=["element_type", "count"]
    )
    inv_df.to_csv(OUT_DIR / "_edm_element_types.csv", index=False)

    # ── Extract key entities ──────────────────────────────────────────────────
    ENTITIES = [
        "CD_WELL",  # Well master data
        "CD_WELLBORE",  # Wellbore data
        "CD_ASSEMBLY",  # BHA assemblies
        "CD_ASSEMBLY_COMP",  # BHA component details
        "CD_HOLE_SECT",  # Hole sections (casing seats / section boundaries)
        "CD_HOLE_SECT_GROUP",  # Hole section groups
        "CD_WELLBORE_FORMATION",  # Formation tops
        "CD_BHA_COMP_MWD",  # MWD BHA components
        "CD_BHA_COMP_STAB",  # Stabilizer components
        "CD_BHA_COMP_NOZZLE",  # Nozzle components
        "CD_BHA_COMP_DP_HW",  # Drill pipe / heavy weight
        "CD_SURVEY_STATION",  # Survey stations
        "CD_DEFINITIVE_SURVEY_STATION",  # Definitive survey stations
        "CD_PORE_PRESSURE",  # Pore pressure data
        "CD_FRAC_GRADIENT",  # Fracture gradient data
        "CD_CASE",  # Casing design cases
        "WP_TDA_DRAGCHART",  # Torque & drag charts
    ]

    for entity in ENTITIES:
        rows = collect_elements(root, entity)
        if rows:
            df = pd.DataFrame(rows)
            out_path = OUT_DIR / f"edm_{entity}.csv"
            df.to_csv(out_path, index=False)
            log.info(f" Saved {entity}: {len(df)} rows → {out_path.name}")
        else:
            log.info(f" {entity}: no rows found")
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
if __name__ == "__main__":
|
| 118 |
+
parse_edm()
|
src/data_pipeline/parse_witsml_logs.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
parse_witsml_logs.py
|
| 3 |
+
--------------------
|
| 4 |
+
Parses WITSML realtime drilling log data from data/raw/WITSML Realtime drilling data/
|
| 5 |
+
into clean CSV files in data/processed/witsml/
|
| 6 |
+
|
| 7 |
+
WITSML tree structure:
|
| 8 |
+
<well_dir>/
|
| 9 |
+
1/ <- wellbore
|
| 10 |
+
log/
|
| 11 |
+
MetaFileInfo.txt <- "1 Depth\n2 DateTime"
|
| 12 |
+
1/ <- Depth-indexed logs
|
| 13 |
+
MetaFileInfo.txt <- log run names (e.g. "26in section MD Log")
|
| 14 |
+
1/ <- log run 1
|
| 15 |
+
1/ <- sequence chunk number
|
| 16 |
+
00001.xml <- actual data XML
|
| 17 |
+
00002.xml
|
| 18 |
+
...
|
| 19 |
+
2/ <- Time-indexed logs
|
| 20 |
+
...
|
| 21 |
+
trajectory/
|
| 22 |
+
_wellboreInfo/
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import os
|
| 26 |
+
import sys
|
| 27 |
+
import xml.etree.ElementTree as ET
|
| 28 |
+
import pandas as pd
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
import logging
|
| 31 |
+
|
| 32 |
+
from utils import normalize_well_name, safe_filename
|
| 33 |
+
|
| 34 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 35 |
+
log = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
# ── Paths ────────────────────────────────────────────────────────────────────
|
| 38 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 39 |
+
RAW_WITSML_DIR = BASE_DIR / "data" / "raw" / "WITSML Realtime drilling data"
|
| 40 |
+
OUT_DIR = BASE_DIR / "data" / "processed" / "witsml"
|
| 41 |
+
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
| 42 |
+
|
| 43 |
+
# WITSML namespace (varies; we strip to handle any)
|
| 44 |
+
def _strip_ns(tag: str) -> str:
|
| 45 |
+
return tag.split("}")[-1] if "}" in tag else tag
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def read_meta(meta_path: Path) -> dict[str, str]:
    """Parse a MetaFileInfo.txt of the form '<key> <value...>' per line.

    Lines that do not contain both a key and a value are skipped;
    a missing file yields an empty mapping.
    """
    if not meta_path.exists():
        return {}
    mapping: dict[str, str] = {}
    content = meta_path.read_text(encoding="utf-8", errors="ignore")
    for raw_line in content.splitlines():
        fields = raw_line.split(None, 1)
        if len(fields) == 2:
            key, value = fields
            mapping[key] = value.strip()
    return mapping
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def parse_log_xml(xml_path: Path) -> pd.DataFrame | None:
    """
    Parse a single WITSML log XML chunk file.

    Extracts curve mnemonics/units from <logCurveInfo> headers and the
    comma-separated <data> rows under <logData>, returning one DataFrame
    whose columns are the curve mnemonics (per-curve units are attached
    in ``df.attrs["units"]``).

    Returns:
        Concatenated DataFrame across all <log> elements in the file,
        or None when the file is malformed or contains no usable data.
    """
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
    except ET.ParseError as e:
        log.warning(f"XML parse error in {xml_path}: {e}")
        return None

    # Find all 'log' elements (handle namespace)
    logs = [c for c in root.iter() if _strip_ns(c.tag) == "log"]
    if not logs:
        return None

    all_frames = []

    for log_elem in logs:
        # ── extract curve headers ──────────────────────────────
        curves = []
        for curve in log_elem:
            if _strip_ns(curve.tag) == "logCurveInfo":
                mnemonic = None
                unit = None
                for sub in curve:
                    tag = _strip_ns(sub.tag)
                    if tag == "mnemonic":
                        mnemonic = sub.text.strip() if sub.text else None
                    elif tag == "unit":
                        unit = (sub.text.strip() if sub.text else "")
                if mnemonic:
                    curves.append({"mnemonic": mnemonic, "unit": unit})

        if not curves:
            continue

        # ── extract data rows ─────────────────────────────────
        rows = []
        for elem in log_elem:
            if _strip_ns(elem.tag) == "logData":
                for data_elem in elem:
                    if _strip_ns(data_elem.tag) == "data" and data_elem.text:
                        values = [v.strip() for v in data_elem.text.split(",")]
                        # Align to curve count (some rows may be partial)
                        while len(values) < len(curves):
                            values.append("")
                        rows.append(values[:len(curves)])

        if not rows:
            continue

        col_names = [c["mnemonic"] for c in curves]
        units_map = {c["mnemonic"]: c["unit"] for c in curves}

        df = pd.DataFrame(rows, columns=col_names)

        # Convert numeric columns; non-numeric columns (e.g. datetime
        # strings) are left untouched.
        # BUG FIX: pd.to_numeric(errors="ignore") is deprecated and
        # removed in pandas 3.0 — replicate it with an explicit
        # try/except so the pipeline keeps working on modern pandas.
        for col in df.columns:
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                pass

        # Tag unit metadata as attribute (not stored in CSV rows)
        df.attrs["units"] = units_map
        all_frames.append(df)

    if not all_frames:
        return None
    return pd.concat(all_frames, ignore_index=True)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def collect_well_log_data(well_dir: Path) -> dict[str, pd.DataFrame]:
    """
    Walk a single well directory and collect all log data.

    Layout: <well_dir>/1/log/<index_type>/<section>/<chunk>/*.xml, where
    the MetaFileInfo.txt at each level names the numbered subfolders.

    Returns dict: {"<index_type_name>|<section_name>" → concatenated DataFrame}
    (keys are joined with '|', not '_').
    """
    # Wellbore folder is conventionally named "1" in this dataset.
    wellbore_dir = well_dir / "1"
    log_dir = wellbore_dir / "log"

    if not log_dir.exists():
        log.warning(f"No log/ dir in {well_dir}")
        return {}

    # Top-level meta: "1 Depth", "2 DateTime"
    top_meta = read_meta(log_dir / "MetaFileInfo.txt")

    all_section_frames = {}

    for index_type_num, index_type_name in top_meta.items():
        index_subdir = log_dir / index_type_num
        if not index_subdir.is_dir():
            continue

        # Per-index meta maps section numbers to log run names.
        section_meta = read_meta(index_subdir / "MetaFileInfo.txt")

        for section_num, section_name in section_meta.items():
            section_dir = index_subdir / section_num
            if not section_dir.is_dir():
                continue

            frames = []
            # Data chunks live in numbered subdirs then 00001.xml etc.
            for chunk_dir in sorted(section_dir.iterdir()):
                if not chunk_dir.is_dir():
                    continue
                for xml_file in sorted(chunk_dir.glob("*.xml")):
                    df = parse_log_xml(xml_file)
                    if df is not None and not df.empty:
                        frames.append(df)

            if frames:
                combined = pd.concat(frames, ignore_index=True)
                label = f"{index_type_name}|{section_name}"
                all_section_frames[label] = combined
                log.info(f" [{label}] → {len(combined)} rows, {combined.shape[1]} cols")

    return all_section_frames
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def get_well_name_from_dir(well_dir: Path, meta_map: dict[str, str]) -> str:
    """Resolve a folder like 'Norway-Statoil-15_$47$_9-F-12' to a well name.

    Prefers the explicit folder→name mapping from MetaFileInfo.txt;
    otherwise decodes the '$47$' escape (ASCII 47 = '/') in the folder
    name itself.
    """
    folder = well_dir.name
    target = folder.strip()
    mapped = next(
        (name for key, name in meta_map.items() if key.strip() == target),
        None,
    )
    if mapped is not None:
        return mapped
    # Fallback: decode the path-separator escape sequence.
    return folder.replace("_$47$_", "/").replace("$47$", "/")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def parse_all_wells():
    """Parse every well directory under RAW_WITSML_DIR into per-section CSVs.

    For each well: resolves the canonical well name, collects all
    depth- and time-indexed log sections, writes one CSV per section to
    OUT_DIR, and finally writes _witsml_extraction_summary.csv.
    """
    # Read global meta mapping (folder name → human-readable well name).
    global_meta_file = RAW_WITSML_DIR / "MetaFileInfo.txt"
    folder_to_well = {}
    if global_meta_file.exists():
        for line in global_meta_file.read_text(encoding="utf-8", errors="ignore").splitlines():
            parts = line.split(" ", 1)
            if len(parts) == 2:
                folder_to_well[parts[0].strip()] = parts[1].strip()

    well_dirs = [d for d in RAW_WITSML_DIR.iterdir()
                 if d.is_dir() and d.name not in ("__pycache__",)]

    all_wells_summary = []

    for well_dir in sorted(well_dirs):
        well_name_raw = get_well_name_from_dir(well_dir, folder_to_well)
        well_name_canonical = normalize_well_name(well_name_raw)
        # Sanitize for filename
        well_name_safe = safe_filename(well_name_canonical)
        log.info(f"\n=== Processing well: {well_name_canonical} ({well_dir.name}) ===")

        section_frames = collect_well_log_data(well_dir)

        if not section_frames:
            log.warning(f" No data found for {well_name_canonical}")
            continue

        # ── Strategy: prefer Depth-indexed data, pick the richest sections ──
        # Merge sections that share the first index column (depth) if possible
        depth_frames = {k: v for k, v in section_frames.items()
                        if k.startswith("Depth")}
        time_frames = {k: v for k, v in section_frames.items()
                       if k.startswith("DateTime")}

        saved_files = []

        # NOTE(review): `suffix` is currently unused — output names come
        # from the section label only.
        def save_frames(frames_dict: dict, suffix: str):
            for label, df in frames_dict.items():
                # Sanitize label for filename
                label_safe = label.replace("|", "_").replace("/", "-").replace(" ", "_")[:80]
                out_path = OUT_DIR / f"{well_name_safe}__{label_safe}.csv"
                df.to_csv(out_path, index=False)
                saved_files.append(str(out_path))
                log.info(f" Saved: {out_path.name} ({len(df)} rows)")

        save_frames(depth_frames, "depth")
        save_frames(time_frames, "time")

        all_wells_summary.append({
            "well_name": well_name_canonical,
            "well_folder": well_dir.name,
            "n_depth_sections": len(depth_frames),
            "n_time_sections": len(time_frames),
            "total_sections": len(section_frames),
        })

    # Save summary
    if all_wells_summary:
        summary_df = pd.DataFrame(all_wells_summary)
        summary_path = OUT_DIR / "_witsml_extraction_summary.csv"
        summary_df.to_csv(summary_path, index=False)
        log.info(f"\nSummary saved to {summary_path}")
        print(summary_df.to_string(index=False))
    else:
        log.warning("No data was extracted from any well.")
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
if __name__ == "__main__":
|
| 259 |
+
parse_all_wells()
|
src/data_pipeline/run_pipeline.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
run_pipeline.py
|
| 3 |
+
---------------
|
| 4 |
+
Master runner for Phase 1 data extraction pipeline.
|
| 5 |
+
Runs in sequence:
|
| 6 |
+
1. parse_witsml_logs → data/processed/witsml/
|
| 7 |
+
2. parse_ddr_xml → data/processed/ddr/
|
| 8 |
+
3. parse_edm → data/processed/edm/
|
| 9 |
+
4. well_registry → data/processed/well_registry.csv
|
| 10 |
+
|
| 11 |
+
Run from project root: python src/data_pipeline/run_pipeline.py
|
| 12 |
+
"""
|
| 13 |
+
import sys
|
| 14 |
+
import logging
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
import pandas as pd
|
| 17 |
+
|
| 18 |
+
logging.basicConfig(
|
| 19 |
+
level=logging.INFO,
|
| 20 |
+
format="%(asctime)s %(levelname)s %(message)s",
|
| 21 |
+
handlers=[
|
| 22 |
+
logging.StreamHandler(sys.stdout),
|
| 23 |
+
logging.FileHandler(Path(__file__).resolve().parents[2] / "data" / "processed" / "pipeline.log",
|
| 24 |
+
mode="w"),
|
| 25 |
+
]
|
| 26 |
+
)
|
| 27 |
+
log = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 30 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def step1_witsml():
    """Step 1: parse WITSML realtime logs into data/processed/witsml/."""
    banner = "=" * 60
    log.info(banner)
    log.info("STEP 1: Parsing WITSML realtime logs")
    log.info(banner)
    try:
        from parse_witsml_logs import parse_all_wells
        parse_all_wells()
        log.info("Step 1 COMPLETE")
    except Exception as e:
        # A failed stage is logged but does not stop the pipeline.
        log.error(f"Step 1 FAILED: {e}", exc_info=True)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def step2_ddr():
    """Step 2: parse Daily Drilling Reports into data/processed/ddr/."""
    banner = "=" * 60
    log.info(banner)
    log.info("STEP 2: Parsing Daily Drilling Reports (DDR)")
    log.info(banner)
    try:
        from parse_ddr_xml import parse_all_ddrs
        parse_all_ddrs()
        log.info("Step 2 COMPLETE")
    except Exception as e:
        # A failed stage is logged but does not stop the pipeline.
        log.error(f"Step 2 FAILED: {e}", exc_info=True)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def step3_edm():
    """Step 3: parse EDM.XML (BHA/casing/well metadata) into data/processed/edm/."""
    banner = "=" * 60
    log.info(banner)
    log.info("STEP 3: Parsing EDM.XML (BHA/casing/well metadata)")
    log.info(banner)
    try:
        from parse_edm import parse_edm
        parse_edm()
        log.info("Step 3 COMPLETE")
    except Exception as e:
        # A failed stage is logged but does not stop the pipeline.
        log.error(f"Step 3 FAILED: {e}", exc_info=True)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def step4_well_registry():
    """Step 4: merge the WITSML and DDR extraction summaries into a
    unified well catalog at data/processed/well_registry.csv.

    Each source contributes rows tagged with a 'source' column; missing
    summaries are simply skipped.
    """
    log.info("=" * 60)
    log.info("STEP 4: Building well metadata registry")
    log.info("=" * 60)
    try:
        processed = BASE_DIR / "data" / "processed"
        rows = []

        # From WITSML summary
        witsml_summary = processed / "witsml" / "_witsml_extraction_summary.csv"
        if witsml_summary.exists():
            df_w = pd.read_csv(witsml_summary)
            for _, r in df_w.iterrows():
                rows.append({
                    "source": "WITSML",
                    "well_name": r.get("well_name", ""),
                    "well_folder": r.get("well_folder", ""),
                    "n_depth_sections": r.get("n_depth_sections", 0),
                    "n_time_sections": r.get("n_time_sections", 0),
                })

        # From DDR summary
        ddr_summary = processed / "ddr" / "_ddr_extraction_summary.csv"
        if ddr_summary.exists():
            df_d = pd.read_csv(ddr_summary)
            for _, r in df_d.iterrows():
                rows.append({
                    "source": "DDR",
                    # DDR summaries key wells by 'well_key', not 'well_name'.
                    "well_name": r.get("well_key", ""),
                    "n_daily_reports": r.get("n_daily_reports", 0),
                    "n_activities": r.get("n_activities", 0),
                })

        if rows:
            df_reg = pd.DataFrame(rows)
            out = processed / "well_registry.csv"
            df_reg.to_csv(out, index=False)
            log.info(f"Well registry saved: {out} ({len(df_reg)} records)")
            print(df_reg.to_string(index=False))
        else:
            log.warning("No data available for well registry")

        log.info("Step 4 COMPLETE")
    except Exception as e:
        # Mirror the other steps: log the failure and keep going.
        log.error(f"Step 4 FAILED: {e}", exc_info=True)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
if __name__ == "__main__":
|
| 117 |
+
log.info("VOLVE FIELD ML CHALLENGE — PHASE 1 DATA PIPELINE")
|
| 118 |
+
step1_witsml()
|
| 119 |
+
step2_ddr()
|
| 120 |
+
step3_edm()
|
| 121 |
+
step4_well_registry()
|
| 122 |
+
log.info("=" * 60)
|
| 123 |
+
log.info("PIPELINE COMPLETE")
|
| 124 |
+
log.info("=" * 60)
|
| 125 |
+
log.info("Outputs:")
|
| 126 |
+
log.info(" data/processed/witsml/ — WITSML drilling parameter CSVs")
|
| 127 |
+
log.info(" data/processed/ddr/ — DDR activity & daily summary CSVs")
|
| 128 |
+
log.info(" data/processed/edm/ — EDM BHA/casing config CSVs")
|
| 129 |
+
log.info(" data/processed/well_registry.csv — unified well catalog")
|
src/data_pipeline/utils.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def normalize_well_name(raw_name: str) -> str:
    """
    Canonicalize a well identifier coming from WITSML, DDR or EDM exports.

    Examples:
        "15/9-F-5  W-508420"                     -> "15/9-F-5"
        "NO 15/9-F-1 C  1bf1cc58-83af-4e13-..."  -> "15/9-F-1 C"
        "15-9-F-1" / "15_9-F-1"                  -> "15/9-F-1"
        "15_9_F_1_C"                             -> "15/9-F-1 C"

    Non-string or blank input yields the sentinel "UNKNOWN".
    """
    if not isinstance(raw_name, str) or not raw_name.strip():
        return "UNKNOWN"

    name = raw_name.strip()

    # Drop the leading country prefix ("NO " / "NO-") some sources carry.
    name = re.sub(r'^NO[\s\-]+', '', name, flags=re.IGNORECASE)

    # WITSML names append an internal id after a double space; keep the part
    # before it (e.g. " W-508420" or a trailing UUID).
    if "  " in name:
        name = name.split("  ")[0]

    # Quadrant/block separator: "15_9" or "15-9" -> "15/9".
    name = re.sub(r'^(\d+)[_\-](\d+)', r'\1/\2', name)

    # Fully underscore-delimited names (e.g. "15/9_F_1_C" after the step
    # above) get rebuilt as "<block>-<slot>-<number> <suffix...>".
    if '_' in name and '/' in name:
        tokens = re.split(r'[_\-]+', name)
        if len(tokens) >= 3:
            rebuilt = '-'.join(tokens[:3])
            if len(tokens) > 3:
                rebuilt = f"{rebuilt} {' '.join(tokens[3:])}"
            name = rebuilt

    # Trailing branch letters ("..._A") and sidetrack markers ("..._ST1",
    # "..._T2") become space-separated suffixes.
    name = re.sub(r'_([A-Z])$', r' \1', name)
    name = re.sub(r'_(ST\d+|T\d+)$', r' \1', name)

    # Leftover underscores act as word separators; collapse repeated spaces.
    name = re.sub(r'\s+', ' ', name.replace('_', ' '))
    return name.strip()
|
| 53 |
+
|
| 54 |
+
def safe_filename(name: str) -> str:
    """Map a canonical well name to a filesystem-safe token ('/', ' ', '-' all become '_')."""
    return name.translate(str.maketrans("/ -", "___"))
|
| 57 |
+
|
src/rag/__init__.py
ADDED
|
File without changes
|
src/rag/build_openviking_db.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
build_openviking_db.py
|
| 3 |
+
----------------------
|
| 4 |
+
Migrates from ChromaDB to OpenViking, using a file-system paradigm for context
|
| 5 |
+
(viking://resources/iadc/ and viking://resources/volve/)
|
| 6 |
+
with tiered loading (L0/L1/L2) and hybrid retrieval.
|
| 7 |
+
Uses Google's `gemini-embedding-2-preview` with rate limits handled via batching.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import time
|
| 12 |
+
import logging
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
|
| 16 |
+
# Ensure the promptfoo and viking dependencies are available
|
| 17 |
+
try:
|
| 18 |
+
from openviking import VikingContextManager, ResourceLoader
|
| 19 |
+
except ImportError:
|
| 20 |
+
logging.warning("openviking not installed natively, stubbing setup for plan compatibility.")
|
| 21 |
+
|
| 22 |
+
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
| 23 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 24 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 25 |
+
|
| 26 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 27 |
+
log = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 32 |
+
TXT_DIR = BASE_DIR / "data" / "knowledge_base" / "raw_text"
|
| 33 |
+
# New OpenViking location
|
| 34 |
+
VIKING_DIR = BASE_DIR / "data" / "viking_context"
|
| 35 |
+
VIKING_DIR.mkdir(parents=True, exist_ok=True)
|
| 36 |
+
|
| 37 |
+
# Free Tier Limits: 100 RPM, 30k TPM. We must be very careful with batching.
|
| 38 |
+
EMBEDDING_MODEL = "models/gemini-embedding-2-preview"
|
| 39 |
+
|
| 40 |
+
def build_database():
    """Ingest scraped knowledge-base text into the OpenViking workspace.

    Pipeline: load .txt files from TXT_DIR -> chunk -> embed with Gemini
    (rate-limited batches) -> persist into a Chroma store that backs
    OpenViking's hybrid retrieval. Returns early (with a log message) when
    the input directory, documents, or the API key are missing.
    """
    if not TXT_DIR.exists():
        log.error(f"Text directory does not exist: {TXT_DIR}")
        return

    # 1. Initialize OpenViking Context Manager.
    # If the optional `openviking` package is absent, the module-level import
    # failed and VikingContextManager is undefined -> NameError here, which is
    # treated as "stub mode": continue with the Chroma fallback only.
    log.info(f"Initializing OpenViking workspace at {VIKING_DIR}...")
    try:
        vi = VikingContextManager(workspace_dir=str(VIKING_DIR))
        vi.create_namespace("resources/iadc")
        vi.create_namespace("resources/volve")
    except NameError:
        log.info("[Stub] OpenViking initialized. Namespaces created: resources/iadc, resources/volve")

    # 2. Load every .txt document under TXT_DIR.
    log.info(f"Loading documents from {TXT_DIR}...")
    loader = DirectoryLoader(str(TXT_DIR), glob="**/*.txt", loader_cls=TextLoader, use_multithreading=True)
    docs = loader.load()
    log.info(f"Loaded {len(docs)} documents.")

    if not docs:
        log.warning("No documents found. Please run scrape_knowledge.py first.")
        return

    # 3. Split into overlapping chunks (OpenViking L2 format; L1/L0 tiers are
    # generated automatically if the library supports it).
    log.info("Chunking documents for Tiered Loading...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_documents(docs)
    log.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")

    # 4. Initialize the Google Gemini embedding client.
    log.info(f"Initializing Google Embeddings: {EMBEDDING_MODEL}")

    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        log.error("GOOGLE_API_KEY not found in environment variables.")
        return

    embeddings = GoogleGenerativeAIEmbeddings(
        model=EMBEDDING_MODEL,
        google_api_key=api_key
    )

    # 5. Build and persist, batching to respect free-tier limits.
    log.info("Building OpenViking Graph with controlled API ingestion...")

    # Conservative pacing for the Google free tier (100 requests/minute):
    # 5 chunks per add_documents call, then a 3.5 s pause between calls.
    batch_size = 5
    sleep_time = 3.5

    from langchain_chroma import Chroma
    fallback_db_dir = VIKING_DIR / "chroma_fallback"

    # ChromaDB remains the underlying vector engine for OpenViking's hybrid retrieval.
    vectorstore = Chroma(
        persist_directory=str(fallback_db_dir),
        embedding_function=embeddings
    )

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]

        # Route each document to its OpenViking namespace based on its source path.
        for doc in batch:
            source = doc.metadata.get('source', '')
            if 'ddr' in source.lower() or 'volve' in source.lower():
                doc.metadata['viking_namespace'] = 'resources/volve/'
            else:
                doc.metadata['viking_namespace'] = 'resources/iadc/'

            # Record which model produced the vectors, for later audits.
            doc.metadata['embedding_model'] = EMBEDDING_MODEL

        try:
            vectorstore.add_documents(batch)
            log.info(f"Embedded {min(i + batch_size, len(chunks))}/{len(chunks)} chunks (Batch Size: {batch_size}). Sleeping {sleep_time}s to respect RPM limits...")
            time.sleep(sleep_time)
        except Exception as e:
            # Typically a rate-limit error: cool down for a minute, retry once,
            # and skip the batch if the retry also fails.
            log.error(f"Google API Error embedding batch {i}: {e}. Waiting 60s to cool down.")
            time.sleep(60)
            try:
                # Retry once
                vectorstore.add_documents(batch)
            except Exception as e2:
                log.error(f"Failed again: {e2}. Skipping batch.")

    log.info(f"Successfully migrated {len(chunks)} chunks into OpenViking structure.")
    log.info("Database is ready for Agentic querying via Hybrid Retrieval.")
|
| 133 |
+
|
| 134 |
+
if __name__ == "__main__":
    # Script entry point: build the OpenViking knowledge store.
    build_database()
|
src/rag/build_vector_db.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
build_vector_db.py
|
| 3 |
+
------------------
|
| 4 |
+
Reads raw scraped text files, chunks them, and embeds them into ChromaDB
|
| 5 |
+
using a local open-source model (all-MiniLM-L6-v2) to avoid API limits.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import logging
|
| 11 |
+
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
| 12 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 13 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 14 |
+
from langchain_chroma import Chroma
|
| 15 |
+
|
| 16 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 17 |
+
log = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 20 |
+
TXT_DIR = BASE_DIR / "data" / "knowledge_base" / "raw_text"
|
| 21 |
+
DB_DIR = BASE_DIR / "data" / "knowledge_base" / "chroma_db"
|
| 22 |
+
EMBEDDING_MODEL = "Octen/Octen-Embedding-0.6B"
|
| 23 |
+
|
| 24 |
+
def build_database():
    """Build the local ChromaDB knowledge base from scraped text files.

    Steps: load every .txt under TXT_DIR, split into overlapping chunks,
    embed with the local HuggingFace model on GPU, and persist the vectors
    to DB_DIR in batches. Any pre-existing database is deleted first so a
    model change cannot leave mismatched embedding dimensions behind.
    """
    if not TXT_DIR.exists():
        log.error(f"Text directory does not exist: {TXT_DIR}")
        return

    # Clear the old index if present — a different embedding model would
    # otherwise collide with the stored vector dimension.
    if DB_DIR.exists():
        log.info(f"Clearing existing database at {DB_DIR} to avoid dimension mismatch...")
        import shutil
        shutil.rmtree(DB_DIR)

    # 1. Load documents.
    log.info(f"Loading documents from {TXT_DIR}...")
    loader = DirectoryLoader(str(TXT_DIR), glob="**/*.txt", loader_cls=TextLoader, use_multithreading=True)
    docs = loader.load()
    log.info(f"Loaded {len(docs)} documents.")

    if not docs:
        log.warning("No documents found. Please run scrape_knowledge.py first.")
        return

    # 2. Split into chunks.
    log.info("Chunking documents...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_documents(docs)
    log.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")

    # 3. Initialize embeddings on GPU VRAM.
    # (HuggingFaceEmbeddings is imported at module level; the redundant
    # in-function re-import that was here has been removed.)
    log.info(f"Initializing powerful model: {EMBEDDING_MODEL}")
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': 'cuda', 'trust_remote_code': True},
        encode_kwargs={'normalize_embeddings': True}
    )

    # 4. Build and persist ChromaDB.
    log.info(f"Building and persisting ChromaDB at {DB_DIR}...")
    DB_DIR.mkdir(parents=True, exist_ok=True)

    # Initialize an empty vector store, then add documents in batches.
    vectorstore = Chroma(
        persist_directory=str(DB_DIR),
        embedding_function=embeddings
    )

    batch_size = 200  # Process 200 chunks at a time for safety
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        vectorstore.add_documents(batch)
        log.info(f"Embedded {min(i + batch_size, len(chunks))}/{len(chunks)} chunks...")

    log.info(f"Successfully embedded {len(chunks)} chunks into ChromaDB.")
    log.info("Database is ready for Agentic querying.")
|
| 82 |
+
|
| 83 |
+
if __name__ == "__main__":
    # Script entry point: (re)build the local Chroma knowledge base.
    build_database()
|
src/rag/build_volve_db.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
build_volve_db.py
|
| 3 |
+
-----------------
|
| 4 |
+
Builds a combined Volve History & Geophysics Vector DB.
|
| 5 |
+
Includes:
|
| 6 |
+
1. Structured DDR Activity Narratives
|
| 7 |
+
2. Geological Formation Picks (Geophysical Interpretations)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import time
|
| 12 |
+
import shutil
|
| 13 |
+
import logging
|
| 14 |
+
import pandas as pd
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from tqdm import tqdm
|
| 17 |
+
|
| 18 |
+
from langchain_core.documents import Document
|
| 19 |
+
from langchain_chroma import Chroma
|
| 20 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 21 |
+
|
| 22 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 26 |
+
DATA_DIR = BASE_DIR / "data" / "processed"
|
| 27 |
+
DB_DIR = BASE_DIR / "data" / "knowledge_base" / "volve_ddr_history"
|
| 28 |
+
DDR_CSV = DATA_DIR / "ddr" / "_ddr_all_activities.csv"
|
| 29 |
+
PICKS_CSV = DATA_DIR / "serialized_text" / "well_picks_narratives.csv"
|
| 30 |
+
|
| 31 |
+
def build_combined_db():
    """Build the combined Volve DDR-history + geophysics Chroma vector DB.

    Sources:
      1. DDR activity rows (DDR_CSV)        -> one document per noteworthy activity
      2. Formation-pick narratives (PICKS_CSV) -> one document per pick

    Any existing database at DB_DIR is removed before indexing.
    """
    # Local import: the function previously depended on an `import re`
    # placed at the BOTTOM of the module; importing here makes the function
    # self-contained and robust to module reorganization.
    import re

    documents = []

    # 1. Ingest DDR Activities.
    if DDR_CSV.exists():
        logger.info(f"Loading DDR activities from {DDR_CSV}...")
        df_ddr = pd.read_csv(DDR_CSV).fillna("")
        for _, row in tqdm(df_ddr.iterrows(), total=len(df_ddr), desc="DDR"):
            well = str(row.get("well_name", ""))
            date = str(row.get("act_start", ""))[:10]
            comm = str(row.get("comments", "")).strip()
            state = str(row.get("state", ""))
            # Routine, comment-free "ok" rows carry no narrative value.
            if not comm and state == "ok":
                continue

            content = f"Date: {date}\nWell: {well}\nActivity: {row.get('activity_code','')}\nDepth: {row.get('md_m','')}m\nComments: {comm}"
            metadata = {"source": "DDR", "well": well, "date": date, "type": "activity"}
            documents.append(Document(page_content=content, metadata=metadata))

    # 2. Ingest Well Picks (Geophysics).
    if PICKS_CSV.exists():
        logger.info(f"Loading Well Picks from {PICKS_CSV}...")
        df_picks = pd.read_csv(PICKS_CSV)
        for _, row in tqdm(df_picks.iterrows(), total=len(df_picks), desc="Picks"):
            content = row["text"]
            # Best-effort extraction of the well name from the narrative text.
            well_match = re.search(r"Well ([\w\s/-]+),", content)
            well = well_match.group(1) if well_match else "Unknown"
            metadata = {"source": "Geophysics", "well": well, "type": "formation_pick"}
            documents.append(Document(page_content=content, metadata=metadata))

    if not documents:
        logger.error("No documents found to index.")
        return

    # Clear any existing DB so stale vectors never mix with the fresh index.
    if DB_DIR.exists():
        shutil.rmtree(DB_DIR)

    # Embeddings: GPU-backed local model, normalized for cosine similarity.
    logger.info("Initializing HuggingFaceEmbeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name="Octen/Octen-Embedding-0.6B",
        model_kwargs={'device': 'cuda', 'trust_remote_code': True},
        encode_kwargs={'normalize_embeddings': True}
    )

    # Vector store, populated in batches of 1000.
    logger.info(f"Building combined Vector DB at {DB_DIR} with {len(documents)} docs...")
    vectorstore = Chroma(persist_directory=str(DB_DIR), embedding_function=embeddings)

    batch_size = 1000
    for i in tqdm(range(0, len(documents), batch_size), desc="Indexing"):
        vectorstore.add_documents(documents[i:i + batch_size])

    logger.info("✅ Successfully built combined Volve History & Geophysics DB.")
|
| 86 |
+
|
| 87 |
+
import re
|
| 88 |
+
if __name__ == "__main__":
    # Script entry point: time and run the full index build.
    t0 = time.time()
    build_combined_db()
    logger.info(f"Total time: {time.time() - t0:.1f}s")
|
src/rag/count_chunks.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Report how many chunks the fallback Chroma store currently holds."""
import logging

# Configure logging before the langchain imports so their noisy loggers
# inherit the ERROR threshold from the start.
logging.basicConfig(level=logging.ERROR)

from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

load_dotenv()  # supplies GOOGLE_API_KEY for the embedding client

# The embedding function is required by the Chroma constructor but is not
# actually invoked for a simple count. (Removed the unused `import os`.)
emb = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-2-preview")
db = Chroma(persist_directory="data/viking_context/chroma_fallback", embedding_function=emb)

# NOTE: `_collection` is a private langchain-chroma attribute; there is no
# public count API, so this may break across library upgrades.
count = db._collection.count()
print(f"Total embedded chunks in DB: {count}")
|
src/rag/scrape_knowledge.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
scrape_knowledge.py
|
| 3 |
+
-------------------
|
| 4 |
+
IADC Lexicon full scrape (Parallel & Resumable):
|
| 5 |
+
1. Discover all letter category pages (A-Z, 0-9)
|
| 6 |
+
2. Paginate through each letter
|
| 7 |
+
3. Save all discovered URLs to a JSON state file.
|
| 8 |
+
4. Use ThreadPoolExecutor to visit each term URL and extract definitions.
|
| 9 |
+
|
| 10 |
+
Uses curl_cffi to bypass bot protection.
|
| 11 |
+
"""
|
| 12 |
+
import time
|
| 13 |
+
import json
|
| 14 |
+
from bs4 import BeautifulSoup
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
import logging
|
| 17 |
+
import concurrent.futures
|
| 18 |
+
from curl_cffi import requests as cfreq
|
| 19 |
+
|
| 20 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 21 |
+
log = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 24 |
+
OUT_DIR = BASE_DIR / "data" / "knowledge_base" / "raw_text"
|
| 25 |
+
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
| 26 |
+
|
| 27 |
+
STATE_FILE = OUT_DIR / "iadc_state.json"
|
| 28 |
+
FINAL_FILE = OUT_DIR / "iadc_glossary_full.txt"
|
| 29 |
+
|
| 30 |
+
# Create a shared session for single-threaded URL discovery
|
| 31 |
+
SESSION = cfreq.Session(impersonate="chrome120")
|
| 32 |
+
BASE = "https://iadclexicon.org"
|
| 33 |
+
|
| 34 |
+
CATEGORIES = ["0-9"] + list("abcdefghijklmnopqrstuvwxyz")
|
| 35 |
+
|
| 36 |
+
WIKI_URLS = [
|
| 37 |
+
"https://en.wikipedia.org/wiki/Bottomhole_assembly",
|
| 38 |
+
"https://en.wikipedia.org/wiki/Rate_of_penetration",
|
| 39 |
+
"https://en.wikipedia.org/wiki/Weight_on_bit",
|
| 40 |
+
"https://en.wikipedia.org/wiki/Drill_string",
|
| 41 |
+
"https://en.wikipedia.org/wiki/Drilling_mud",
|
| 42 |
+
"https://en.wikipedia.org/wiki/Blowout_(well_drilling)",
|
| 43 |
+
"https://en.wikipedia.org/wiki/Casing_(borehole)",
|
| 44 |
+
"https://en.wikipedia.org/wiki/Directional_drilling",
|
| 45 |
+
]
|
| 46 |
+
|
| 47 |
+
def get_page(url: str, retries: int = 3, session=None) -> str | None:
    """Fetch *url* with up to *retries* attempts and linear backoff.

    Uses the shared module session unless *session* is supplied.
    Returns the page HTML on HTTP 200, otherwise None.
    """
    client = session or SESSION
    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            resp = client.get(url, timeout=15)
        except Exception as e:
            log.warning(f"Error {url}: {e} (attempt {attempt})")
        else:
            if resp.status_code == 200:
                return resp.text
            log.warning(f"[{resp.status_code}] {url} (attempt {attempt})")
        # Back off a little longer after each failed attempt.
        time.sleep(1.5 * attempt)
    return None
|
| 59 |
+
|
| 60 |
+
def get_all_article_links_from_page(html: str) -> list[str]:
    """Extract glossary-term URLs from a category listing page.

    For each <article> on the page (sidebar articles excluded) the first
    on-site link that is neither a /glossary/ category link nor an api.org
    link is kept, with any trailing slash stripped.

    NOTE(review): the original indentation was lost in the diff view; the
    `break` is assumed to end the per-article link scan after the first
    qualifying href — confirm against the live site's markup.
    """
    soup = BeautifulSoup(html, "html.parser")
    # The listing body lives under #content (or #wrap-main-section on some pages).
    content = soup.find(id="content") or soup.find(id="wrap-main-section")
    if not content: return []
    term_links = []
    for article in content.find_all("article"):
        # Skip widgets rendered inside the sidebar.
        if article.find_parent(id="sidebar-primary"): continue
        for a in article.find_all("a", href=True):
            href = a["href"]
            if href.startswith(BASE) and "/glossary/" not in href and "api.org" not in href:
                term_links.append(href.rstrip("/"))
                break
    return term_links
|
| 73 |
+
|
| 74 |
+
def get_next_page_url(html: str) -> str | None:
    """Return the pagination 'next' link from a glossary listing page, or None."""
    anchor = BeautifulSoup(html, "html.parser").find("a", class_="next page-numbers")
    if anchor is None or not anchor.get("href"):
        return None
    return anchor["href"]
|
| 79 |
+
|
| 80 |
+
def extract_definition(url: str) -> dict | None:
    """Fetch one lexicon term page and pull out its name and definition.

    Thread-safe: each call builds its own short-lived curl_cffi session
    because the shared module session is not safe across thread-pool
    workers. Returns {"url", "name", "def"} or None when the page cannot
    be fetched or yields no definition text.
    """
    sess = cfreq.Session(impersonate="chrome120")
    html = get_page(url, session=sess)
    if not html: return None

    soup = BeautifulSoup(html, "html.parser")
    # Term name: the page <h1>, falling back to the URL slug.
    h1 = soup.find("h1")
    term_name = h1.get_text(" ", strip=True) if h1 else url.split("/")[-1]

    # Locate the "Definition" <h3> heading, if the page uses that layout.
    defn_header = None
    for h3 in soup.find_all("h3"):
        if "Definition" in h3.get_text():
            defn_header = h3
            break

    if defn_header:
        # Collect the visible text of each sibling node after the heading,
        # stopping at the entry footer. Plain-string siblings (no get_text)
        # are stringified directly.
        parts = []
        for sibling in defn_header.next_siblings:
            if hasattr(sibling, "has_attr"):
                classes = sibling.get("class", [])
                if "entry-footer" in classes: break
            txt = sibling.get_text("\n", strip=True) if hasattr(sibling, "get_text") else str(sibling).strip()
            if txt: parts.append(txt)
        definition = "\n".join(parts).strip()
    else:
        # Fallback layout: grab the whole entry body text.
        body = soup.find(class_="entry-content") or soup.find(id="content")
        definition = body.get_text("\n", strip=True) if body else ""

    if not definition: return None
    return {"url": url, "name": term_name, "def": definition}
|
| 111 |
+
|
| 112 |
+
def scrape_iadc():
    """Crawl the IADC Lexicon: discover all term URLs, then extract definitions.

    Two phases, both resumable via STATE_FILE:
      1. Discovery — paginate every letter category and record term URLs.
      2. Extraction — fetch each unprocessed term page in a 10-worker
         thread pool, checkpointing state every 50 results.
    Successfully parsed terms are written to FINAL_FILE at the end.
    """
    log.info("=== IADC Lexicon Full Crawl ===")

    # Resume from a previous run when a state file exists.
    state = {"urls": [], "extracted": {}}
    if STATE_FILE.exists():
        try:
            state = json.loads(STATE_FILE.read_text("utf-8"))
            log.info(f"Loaded existing state: {len(state['urls'])} URLs, {len(state['extracted'])} extracted.")
        except json.JSONDecodeError:
            # Corrupt state file: fall back to a fresh crawl.
            pass

    all_term_urls = set(state["urls"])

    # Phase 1: URL discovery. The 8000 threshold is a heuristic — below it,
    # assume discovery was incomplete and re-crawl the category pages
    # (duplicates are harmless because URLs live in a set).
    if len(all_term_urls) < 8000:
        log.info("Discovering URLs...")
        for cat in CATEGORIES:
            page_url = f"{BASE}/glossary/{cat}/"
            page_num = 1
            while page_url:
                log.info(f" [{cat}] page {page_num} → {page_url}")
                html = get_page(page_url)
                if not html: break

                new_links = get_all_article_links_from_page(html)
                all_term_urls.update(new_links)

                # Save state after every page so discovery is resumable.
                state["urls"] = list(all_term_urls)
                STATE_FILE.write_text(json.dumps(state), encoding="utf-8")

                page_url = get_next_page_url(html)
                page_num += 1
                time.sleep(0.5)

    all_term_urls = sorted(all_term_urls)
    log.info(f"\nTotal unique term URLs: {len(all_term_urls)}")

    # Phase 2: extract definitions in parallel, skipping already-done URLs.
    urls_to_process = [u for u in all_term_urls if u not in state["extracted"]]
    log.info(f"Terms remaining to extract: {len(urls_to_process)}")

    extracted_count = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(extract_definition, url): url for url in urls_to_process}

        for future in concurrent.futures.as_completed(futures):
            url = futures[future]
            try:
                res = future.result()
                if res:
                    state["extracted"][url] = f"TERM: {res['name']}\nURL: {res['url']}\n\n{res['def']}"
                else:
                    # Record the failure so it is not retried on resume.
                    state["extracted"][url] = "ERROR: Could not parse"

                extracted_count += 1
                # Checkpoint progress every 50 extractions.
                if extracted_count % 50 == 0:
                    log.info(f" Extracted {extracted_count}/{len(urls_to_process)} ...")
                    STATE_FILE.write_text(json.dumps(state), encoding="utf-8")
            except Exception as e:
                log.warning(f"Error extracting {url}: {e}")

    # Final save
    STATE_FILE.write_text(json.dumps(state), encoding="utf-8")

    # Write output: only successfully parsed terms go into the glossary file.
    valid_records = [v for k, v in state["extracted"].items() if not v.startswith("ERROR")]
    if valid_records:
        FINAL_FILE.write_text("\n\n---\n\n".join(valid_records), encoding="utf-8")
        log.info(f"\nSaved {len(valid_records)} complete terms → {FINAL_FILE.name}")
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def scrape_wikipedia():
    """Download the Wikipedia drilling articles and save cleaned text files.

    Strips scripts, styles, tables, reference lists, and navboxes before
    extracting text; writes one `wiki_<Article>.txt` per URL into OUT_DIR.
    """
    log.info("=== Wikipedia Drilling Articles ===")
    for url in WIKI_URLS:
        html = get_page(url)
        if not html: continue
        soup = BeautifulSoup(html, "html.parser")
        content = soup.find(id="mw-content-text")
        if content:
            # Remove non-prose noise. FIX: the previous code passed
            # "div.reflist"/"div.navbox" into the find_all shorthand, which
            # matches *tag names* and therefore never matched anything;
            # CSS selectors must go through .select().
            for noise in content(["script", "style", "table"]):
                noise.decompose()
            for noise in content.select("div.reflist, div.navbox"):
                noise.decompose()
            text = content.get_text("\n", strip=True)
            name = url.split("/")[-1]
            out_path = OUT_DIR / f"wiki_{name}.txt"
            out_path.write_text(f"Source: {url}\n\n{text}", encoding="utf-8")
            log.info(f" Saved {name}")
        # Be polite to Wikipedia between requests.
        time.sleep(1)
|
| 204 |
+
|
| 205 |
+
if __name__ == "__main__":
    # Script entry point: crawl the IADC lexicon first, then Wikipedia.
    scrape_iadc()
    scrape_wikipedia()
    log.info("=== Scraping complete ===")
|
src/rag/test_openviking.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
test_openviking.py
|
| 3 |
+
------------------
|
| 4 |
+
Verifies that the `tools.py` OpenViking and Gemini integrations
|
| 5 |
+
can successfully retrieve L1/L2 summaries and texts.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from src.agents.tools import IADC_SearchTool, VolveHistory_SearchTool
|
| 9 |
+
|
| 10 |
+
def run_tests():
    """Smoke-test the IADC and Volve retrieval tools with one query each.

    Prints the first 500 characters of each tool's response so a human can
    eyeball that retrieval is working end-to-end.
    """
    print("Initializing Tests...")
    iadc_tool = IADC_SearchTool()
    volve_tool = VolveHistory_SearchTool()

    # Test 1: IADC definition lookup.
    # FIX: the banner strings used "\\n" (a double-escaped backslash) and so
    # printed a literal backslash-n instead of a newline.
    print("\n--- Test 1 (IADC Definition) ---")
    res1 = iadc_tool._run("What is non-productive time (NPT)?")
    print("Result snippet:", res1[:500])

    # Test 2: Volve historical event lookup.
    print("\n--- Test 2 (Volve Event) ---")
    res2 = volve_tool._run("Did any stuck pipe incidents occur on 15/9-19 A?")
    print("Result snippet:", res2[:500])
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
    # Script entry point: run both retrieval smoke tests.
    run_tests()
|
src/rag/test_retrieval.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
test_retrieval.py
|
| 3 |
+
-----------------
|
| 4 |
+
Tests the locally built ChromaDB vector store
|
| 5 |
+
using the sentence-transformer embeddings.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import logging
|
| 11 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 12 |
+
from langchain_chroma import Chroma
|
| 13 |
+
|
| 14 |
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
| 15 |
+
log = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 18 |
+
DB_DIR = BASE_DIR / "data" / "knowledge_base" / "chroma_db"
|
| 19 |
+
|
| 20 |
+
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"
|
| 21 |
+
|
| 22 |
+
def test_query(query: str, k: int = 3):
    """Run a similarity search against the persisted Chroma store and log the top-k hits.

    Loads the BGE sentence-transformer embedder (CPU, normalized vectors),
    opens the on-disk Chroma database, and logs a 500-character snippet of
    each retrieved chunk together with its score and source file name.
    Returns early with an error message if the database has not been built.
    """
    if not DB_DIR.exists():
        log.error("ChromaDB not found. Run build_vector_db.py first.")
        return

    log.info(f"Loading BGE model ({EMBEDDING_MODEL})...")
    embedder = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )

    log.info(f"Loading Chroma database from {DB_DIR}...")
    store = Chroma(persist_directory=str(DB_DIR), embedding_function=embedder)

    log.info(f"\n--- QUERY: '{query}' ---")
    hits = store.similarity_search_with_score(query, k=k)
    if not hits:
        log.warning("No results found.")
        return

    for rank, (doc, score) in enumerate(hits, 1):
        src = doc.metadata.get('source', 'Unknown')
        log.info(f"\n[Result {rank} | SimScore: {score:.4f} | Source: {Path(src).name}]")
        # Flatten newlines and cap the logged snippet at 500 characters.
        flat = doc.page_content.replace('\n', ' ')
        snippet = f"{flat[:500]}..." if len(flat) > 500 else flat
        log.info(snippet)
|
| 53 |
+
|
| 54 |
+
if __name__ == "__main__":
    # Join all CLI words into one query string; fall back to a canned
    # drilling question when the script is run with no arguments.
    if len(sys.argv) > 1:
        query = " ".join(sys.argv[1:])
    else:
        query = "What causes stuck pipe during a drilling operation?"
        log.info("No query provided. Using default:")
    test_query(query)
|
tests/prompts/analyst_prompt.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are the Drilling Data Analyst.
|
| 2 |
+
Your goal is to Retrieve, correlate, and analyze exact numerical data from DDR and WITSML datasets.
|
| 3 |
+
|
| 4 |
+
You are a master of Volve field data (North Sea, 1993–2016, 23 wells, Equinor asset).
|
| 5 |
+
You have purpose-built tools for standard queries and a `python_interpreter` for complex analytics.
|
| 6 |
+
|
| 7 |
+
WORKFLOW — always follow this order:
|
| 8 |
+
1. Call `data_inventory_inspector` first to confirm available datasets and well names.
|
| 9 |
+
2. Use `DDR_Query` for activity logs, NPT, and phase timelines.
|
| 10 |
+
3. Use `WITSML_Analyst` for sensor-level stats (ROP, WOB, RPM, TQA, HKLD, SPPA).
|
| 11 |
+
4. Use `EDM_Technical_Query` for formation tops, BHA specs, and casing design.
|
| 12 |
+
5. Use `python_interpreter` for custom cross-source correlations or complex chart generation.
|
| 13 |
+
NEVER guess data — if a tool returns an error, report exactly what was unavailable.
|
| 14 |
+
|
| 15 |
+
DRILLING PHASE CLASSIFICATION:
|
| 16 |
+
When analyzing DDR activity logs, always map activity_codes to drilling phases:
|
| 17 |
+
- "drilling -- drill" → Rotary Drilling (or Sliding if directional context implies it)
|
| 18 |
+
- "drilling -- trip" / "trip in hole" / "trip out of hole" → Tripping (POOH/TIH)
|
| 19 |
+
- "drilling -- circulate" / "circ" / "condition" → Circulation / Conditioning
|
| 20 |
+
- "drilling -- wiper" → Wiper Trip
|
| 21 |
+
- "casing" / "liner" / "run casing" → Casing / Liner Running
|
| 22 |
+
- "cement" / "cementing" → Cementing
|
| 23 |
+
- "logging" / "wireline" → Logging / Survey
|
| 24 |
+
- "npt" / "wait" / "weather" / "repair" / "fishing" / "stuck" → NPT (classify sub-type)
|
| 25 |
+
- "sidetrack" / "whipstock" / "milling" → Sidetrack / Remedial
|
| 26 |
+
- "bha" / "bit change" → BHA Change / Rig-Up
|
| 27 |
+
Always report phase breakdown as total hours AND percentage of total logged time.
|
| 28 |
+
|
| 29 |
+
PROACTIVE CHART GENERATION — trigger these automatically when data allows:
|
| 30 |
+
1. Days vs. Depth Curve: For ANY well performance or progress question. Use act_start (DDR) on X-axis, md_m on Y-axis (inverted). A flattening slope = NPT period. This is the single most important chart in drilling engineering.
|
| 31 |
+
2. ROP vs. Depth: Overlay EDM formation tops as horizontal reference lines. Reveals lithology impact.
|
| 32 |
+
3. Hookload and Torque vs. Depth: For stuck pipe, drag, torque issues, or hole cleaning analysis.
|
| 33 |
+
4. NPT Pareto Chart: Group by activity_code or state_detail. Use state == 'fail' OR NPT keyword matching. Never hallucinate a Category column.
|
| 34 |
+
5. Crossplots (ROP vs WOB, ROP vs RPM): For parameter optimization or BHA run comparison.
|
| 35 |
+
6. Multi-well Comparison Bar Chart: For any cross-well request.
|
| 36 |
+
|
| 37 |
+
VISUALIZATION RULES:
|
| 38 |
+
- ALWAYS use Plotly (px, go) via save_plotly_html(fig, 'filename'). Never Matplotlib for primary charts.
|
| 39 |
+
- Y-axis on depth plots MUST be inverted — deeper = lower on screen. This is industry standard.
|
| 40 |
+
- Annotate EDM formation tops on depth plots where available.
|
| 41 |
+
- Color-code by drilling phase or activity type for clarity.
|
| 42 |
+
|
| 43 |
+
VOLVE FIELD CONTEXT — apply when relevant:
|
| 44 |
+
- Formations (shallow to deep): Nordland Gp → Shetland Gp → Balder Fm. → Lista Fm. → Ty Fm. → Heimdal Fm. → Skagerrak Fm. → Hugin Fm. (reservoir target).
|
| 45 |
+
- Common hazards: stuck pipe in Shetland shales, lost circulation in Balder (fractured/unconsolidated), weather NPT (North Sea, high winds/heave in winter).
|
| 46 |
+
- Typical hole sections: 36" conductor → 26" surface → 17.5" → 12.25" → 8.5" reservoir.
|
| 47 |
+
- Time range: 15/9-19 S (1993) through final Volve wells (~2016).
|
| 48 |
+
|
| 49 |
+
CORRELATION MANDATE:
|
| 50 |
+
For any complete analysis, comparison, or performance review, pull from ALL sources:
|
| 51 |
+
EDM (geology + BHA) + WITSML (sensor data) + DDR (timeline + activities) — then explicitly correlate.
|
| 52 |
+
Example insight: "ROP drops from 18 m/hr to 6 m/hr at 3,200 m MD, correlating with the Shetland Group top (EDM), and DDR records 12 hours of stuck pipe at that depth."
|
| 53 |
+
|
| 54 |
+
Context: {{context}}
|
| 55 |
+
Question: {{question}}
|
tests/prompts/auditor_prompt.txt
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are the Rig Operations Auditor.
|
| 2 |
+
Your goal is to Audit the findings of the Analyst and Historian for technical consistency and hidden statistical patterns.
|
| 3 |
+
|
| 4 |
+
You are a veteran Drilling Superintendent with deep experience in the North Sea (Volve field, Norwegian sector, 1993–2016).
|
| 5 |
+
You specialize in catching discrepancies between sensor data (WITSML) and activity reports (DDR).
|
| 6 |
+
|
| 7 |
+
STATISTICAL AUDIT — always perform:
|
| 8 |
+
- Compare Mean vs. Median for all key parameters (ROP, WOB, RPM, TQA). A large gap between mean and median signals outlier-dominated data.
|
| 9 |
+
- Assess Standard Deviation relative to the mean. StdDev > 50% of Mean = inconsistent drilling; investigate cause.
|
| 10 |
+
- Identify if performance was driven by consistent drilling or by a few exceptional runs.
|
| 11 |
+
- For ROP specifically: determine whether high variability was due to formation changes (Shetland → Balder → Skagerrak) or mechanical dysfunction (bit wear, vibration, BHA failure).
|
| 12 |
+
NEVER reject the data as invalid. Accept it and interpret what anomalies mean operationally.
|
| 13 |
+
|
| 14 |
+
DELEGATION — use sparingly and only when justified:
|
| 15 |
+
- Only delegate back to the Data Analyst if a specific numerical gap exists that materially changes the conclusion and CANNOT be reasoned from the existing data.
|
| 16 |
+
- Limit to one delegation per audit cycle. State your hypothesis clearly before delegating.
|
| 17 |
+
- If the existing data is sufficient to reach a conclusion, do NOT delegate — just conclude.
|
| 18 |
+
|
| 19 |
+
NPT CLASSIFICATION — mandatory when NPT is present:
|
| 20 |
+
Categorize every significant NPT event:
|
| 21 |
+
- Equipment Failure (motor/MWD/LWD/bit failure, twist-off, string failure)
|
| 22 |
+
- Stuck Pipe (differential sticking, mechanical sticking in Shetland shales)
|
| 23 |
+
- Lost Circulation / LCM (Balder Fm. typically problematic in this field)
|
| 24 |
+
- Weather / WOW (Waiting on Weather — North Sea winter ops, >65 kt wind limit, high heave)
|
| 25 |
+
- Fishing / Remedial (whipstock, milling, sidetrack operations)
|
| 26 |
+
- Wellbore Stability (tight hole, overpull, cavings — common in Shetland Group)
|
| 27 |
+
Quote the specific DDR comment that supports each classification.
|
| 28 |
+
|
| 29 |
+
VOLVE FIELD CONTEXT — apply when interpreting anomalies:
|
| 30 |
+
- Shetland Group shales are a known wellbore stability risk (stuck pipe, tight hole, overpull).
|
| 31 |
+
- Balder Formation is fractured and prone to lost circulation.
|
| 32 |
+
- Skagerrak Formation is the primary reservoir interval — expect lower WOB, managed ROP.
|
| 33 |
+
- Weather NPT is a recurring theme (documented WOW events across 15/9-19 B, 15/9-F-5, 15/9-F-15 D and others).
|
| 34 |
+
- Many wells involved sidetracks — this is a major driver of cumulative NPT.
|
| 35 |
+
|
| 36 |
+
AMBIGUOUS STATE DETECTION — flag explicitly when present:
|
| 37 |
+
Identify and flag periods where the operational state is uncertain or contradictory:
|
| 38 |
+
- DDR activity_code says "drilling -- drill" BUT WITSML ROP is consistently 0.0 m/hr → likely tripping or circulation labeled incorrectly
|
| 39 |
+
- DDR state = 'ok' BUT comments contain words like "overpull", "tight hole", "drag", "back-ream" → covert wellbore stability issue, not captured in activity codes
|
| 40 |
+
- Multiple consecutive repair/maintenance activities without a clear stuck pipe or failure event → could be incremental tool wear or unexplained downtime
|
| 41 |
+
- Days vs Depth slope flattens with no corresponding NPT activity_code → unlogged or mislabeled NPT
|
| 42 |
+
For each flagged period state: What the sensor data shows vs. what the DDR reports, and why this ambiguity matters operationally.
|
| 43 |
+
|
| 44 |
+
CONFIDENCE ASSESSMENT — required in your output:
|
| 45 |
+
After your audit, explicitly state:
|
| 46 |
+
- Confidence level: High / Medium / Low
|
| 47 |
+
- Reasoning for confidence level (e.g., "High — based on 749 DDR records with consistent activity_code coverage")
|
| 48 |
+
- Key uncertainties that could change the conclusion
|
| 49 |
+
|
| 50 |
+
Context: {{context}}
|
| 51 |
+
Question: {{question}}
|
tests/prompts/historian_prompt.txt
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are the Volve Campaign Historian.
|
| 2 |
+
Your goal is to Find qualitative context from the Daily Drilling Report narratives for events identified by the Data Analyst.
|
| 3 |
+
|
| 4 |
+
You search the semantic DDR database for narrative descriptions of what happened on the rig.
|
| 5 |
+
When the Data Analyst identifies high NPT, an anomaly, or a performance change, you search for the 'why'.
|
| 6 |
+
|
| 7 |
+
SEARCH STRATEGY:
|
| 8 |
+
Use the Volve Campaign History DB Search tool with targeted queries. Be specific:
|
| 9 |
+
- Include the well name: "stuck pipe 15/9-19 A" rather than "stuck pipe"
|
| 10 |
+
- Include approximate date or depth if known: "sidetrack 15/9-F-1 C 1999" or "lost circulation Balder 2007"
|
| 11 |
+
- Search for equipment by name: "motor failure 15/9-F-12", "MWD failure 15/9-F-14"
|
| 12 |
+
Run 2–3 targeted searches maximum per task. Do not loop indefinitely.
|
| 13 |
+
|
| 14 |
+
BHA & EQUIPMENT CONTEXT:
|
| 15 |
+
For drilling performance or NPT events, actively query for:
|
| 16 |
+
- BHA components in use during the affected run (motor type, bit size, MWD/LWD configuration)
|
| 17 |
+
- Equipment failures (motor stall, MWD wash-out, bit damage, twist-off)
|
| 18 |
+
- Configuration changes made in response to an issue (new BHA, bit change, WOB/RPM adjustment)
|
| 19 |
+
|
| 20 |
+
VOLVE FIELD SPECIFIC KNOWLEDGE — apply when interpreting narrative text:
|
| 21 |
+
- "Tight hole" / "overpull" / "back-reaming required" → Wellbore stability, likely Shetland shales
|
| 22 |
+
- "Lost returns" / "LCM" / "partial losses" → Balder Formation (fractured) or shallow hazard zone
|
| 23 |
+
- "WOW" / "waiting on weather" → North Sea weather NPT; common in Q4/Q1, wind >65 kt / heave >3 m
|
| 24 |
+
- "Kicked" / "influx" / "shut-in" → Well control event; check well name and depth context
|
| 25 |
+
- "Sidetrack" / "whipstock" / "window" → Remedial operation due to fish/stuck pipe or geological target change
|
| 26 |
+
- "TIH" = Trip In Hole, "POOH" = Pull Out Of Hole, "MU" = Make Up, "BU" = Bottoms Up (circulating cuttings from bit to surface)
|
| 27 |
+
|
| 28 |
+
DELEGATION — use only when essential:
|
| 29 |
+
If a qualitative DDR narrative explicitly cites a numerical value (e.g., "ROP dropped to 2 m/hr", "WOB limited to 5 klbs")
|
| 30 |
+
AND that number was NOT already provided by the Data Analyst's WITSML query,
|
| 31 |
+
THEN delegate one targeted request to the Data Analyst to confirm the figure numerically.
|
| 32 |
+
Limit to ONE delegation per task. Never delegate for general verification — trust the Analyst's already-provided data.
|
| 33 |
+
|
| 34 |
+
Your output should be a concise narrative summary with DDR source citations:
|
| 35 |
+
"[Volve DDR — 15/9-F-14, 2008-03-15]: Motor stall confirmed at 3,420 m MD. BHA pulled and replaced with new 8.5" assembly..."
|
| 36 |
+
|
| 37 |
+
Context: {{context}}
|
| 38 |
+
Question: {{question}}
|
tests/prompts/lead_prompt.txt
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are the Lead Drilling Engineer.
|
| 2 |
+
Your goal is to Synthesize the Analyst's data and Historian's context into a professional Markdown report.
|
| 3 |
+
|
| 4 |
+
You are the ultimate technical authority. Your name is Odin. You speak directly to the Chief Drilling Engineer.
|
| 5 |
+
You synthesize quantitative data and qualitative context into a clear, concise, highly technical assessment.
|
| 6 |
+
|
| 7 |
+
IDENTITY RULES — CRITICAL, NEVER VIOLATE:
|
| 8 |
+
- ABSOLUTELY NO email headers. No "To:", "From:", "Subject:", "Date:", "CC:", or memorandum blocks. Starting your response with any of these will be treated as a failure.
|
| 9 |
+
- DO NOT say "The Analyst found..." or "The Historian reported..." or "I have tasked the Auditor...".
|
| 10 |
+
- DO NOT promise future investigation ("I will look into..."). Your output must be the complete conclusion NOW.
|
| 11 |
+
- NEVER reference your crew members. Present all findings as facts YOU derived natively.
|
| 12 |
+
- Start your response immediately with a direct section heading (e.g., "## Drilling Performance: 15/9-F-12") or a direct technical statement.
|
| 13 |
+
|
| 14 |
+
OUTPUT STRUCTURE — use these sections when they add value:
|
| 15 |
+
1. **Executive Summary** (2–3 sentences: what happened, operational significance)
|
| 16 |
+
2. **Evidence** (data tables, specific dates/depths from DDR, WITSML stats, EDM formation tops)
|
| 17 |
+
3. **Reasoning** (engineering interpretation — what numbers mean, root causes, phase transitions)
|
| 18 |
+
4. **Assumptions & Confidence** (explicit data gaps, confidence level: High/Medium/Low with justification)
|
| 19 |
+
5. **Operational Recommendations** (where evidence supports it)
|
| 20 |
+
|
| 21 |
+
For conversational or single-fact questions, skip the full structure and give a direct technical paragraph.
|
| 22 |
+
|
| 23 |
+
EVIDENCE & CITATION RULES:
|
| 24 |
+
- Cite sources inline: "Volve DDR for 15/9-F-12 records...", "WITSML sensor logs show...", "EDM formation tops indicate..."
|
| 25 |
+
- Reference IADC definitions when used: "Per IADC, NPT is defined as..."
|
| 26 |
+
- State confidence on quantitative claims: "(High confidence — derived from 2,115 DDR activity records)"
|
| 27 |
+
|
| 28 |
+
TECHNICAL DEPTH:
|
| 29 |
+
- Interpret statistical spread (mean vs median, StdDev) in operational terms — never just report averages.
|
| 30 |
+
- Classify NPT explicitly: Equipment Failure | Stuck Pipe | Weather (WOW) | Losses/LCM | Wellbore Stability | Fishing | Sidetrack.
|
| 31 |
+
- Reference drilling phase transitions when relevant (Rotary Drilling → Sliding → Tripping → Cementing → Logging).
|
| 32 |
+
- Incorporate formation context (Hugin Fm., Skagerrak Fm., Shetland Group, Balder Fm.) when depth or geology is discussed.
|
| 33 |
+
- Reference hole sections by size (36", 26", 17.5", 12.25", 8.5") when comparing performance across intervals.
|
| 34 |
+
|
| 35 |
+
INLINE CHARTS:
|
| 36 |
+
Any interactive Plotly charts are automatically appended to the bottom of your response by the UI.
|
| 37 |
+
Reference them naturally in your text: "As seen in the Days vs. Depth dashboard below, the slope flattens sharply at ~3,400 m MD, indicating a period of NPT..."
|
| 38 |
+
NEVER say "a chart was saved to disk."
|
| 39 |
+
|
| 40 |
+
OPERATIONAL HANDOVER SUMMARY — use this exact structure when asked for a "handover" or "shift summary":
|
| 41 |
+
## Operational Handover — [Well Name] — [Date if known]
|
| 42 |
+
| Field | Status |
|
| 43 |
+
|---|---|
|
| 44 |
+
| **Current Depth** | [MD / TVD from DDR] |
|
| 45 |
+
| **Current Operation** | [activity_code at last DDR record] |
|
| 46 |
+
| **Last BHA Run** | [assembly name + hole size from EDM] |
|
| 47 |
+
|
| 48 |
+
**Work Completed:**
|
| 49 |
+
- [bullet list of key activities completed, with depths and durations]
|
| 50 |
+
|
| 51 |
+
**Outstanding Issues / Watch Points:**
|
| 52 |
+
- [active NPT events, stuck pipe, losses — with severity and duration so far]
|
| 53 |
+
|
| 54 |
+
**Planned Next Operations:**
|
| 55 |
+
- [inferred from DDR trajectory and common well construction sequence]
|
| 56 |
+
|
| 57 |
+
**Confidence:** [High/Medium/Low] — [brief justification: data coverage and recency]
|
| 58 |
+
|
| 59 |
+
PERFORMANCE PREDICTION — use this approach when asked to predict or extrapolate:
|
| 60 |
+
Base the prediction on analog wells from the Volve dataset that drilled the same formation or hole section.
|
| 61 |
+
Structure as:
|
| 62 |
+
1. **Analog Basis:** "Based on [Well X] which drilled [formation/section], achieving [ROP/NPT/duration]..."
|
| 63 |
+
2. **Expected Range:** "For a similar [hole section] in [formation], expect ROP of [X–Y] m/hr, total NPT risk of [Z] hours."
|
| 64 |
+
3. **Key Risk Factors:** Formation hazards (e.g., Shetland shales, Balder losses), weather window, BHA selection.
|
| 65 |
+
4. **Confidence:** State explicitly what would increase prediction confidence (e.g., offset well LWD data, updated pore pressure model).
|
| 66 |
+
|
| 67 |
+
NPT CLASSIFICATION — MANDATORY when NPT is discussed:
|
| 68 |
+
Classify every significant NPT event by root-cause category and justify with a specific DDR comment or activity code.
|
| 69 |
+
Example: "Weather NPT (47.5 h): DDR comments cite sustained winds of 20–30 m/s and rig heave of 4.6–9.7 m on 15/9-19 B in November 1997."
|
| 70 |
+
|
| 71 |
+
Context: {{context}}
|
| 72 |
+
Question: {{question}}
|