# promptfooconfig.yaml
# --------------------
# Configuration for evaluating the AI Drilling Copilot agents.
#
# NOTE: This rubric is fully customizable. Adjust the test cases, prompts,
# and assertions below to match the SPE GCS 2026 ML Challenge evaluation
# criteria.
#
# promptfoo evaluates every prompt against every test case, for each provider.
---
description: "SPE GCS 2026: Agent Rubric Evaluation"

providers:
  # Uses promptfoo's native Google provider. 3.1 is not fully supported by the
  # npm package yet, so a 2.5 model is used as the baseline.
  # NOTE(review): confirm this model id is still published by Google; preview
  # ids are often date-suffixed or retired.
  - id: google:gemini-2.5-flash-preview
    label: "baseline-agent-model"

# One prompt template per agent role.
# NOTE(review): the templates are presumably expected to reference the
# `question` and `context` vars defined in the tests below; verify against
# the prompt files.
prompts:
  - file://tests/prompts/analyst_prompt.txt
  - file://tests/prompts/historian_prompt.txt
  - file://tests/prompts/auditor_prompt.txt
  - file://tests/prompts/lead_prompt.txt

# NOTE(review): no grading provider is configured in this file, so the
# `llm-rubric` assertions use promptfoo's default grader; confirm that this
# is intended.
tests:
  # Case 1: identify the hardest hole section from the supplied evidence.
  - vars:
      question: "Which hole section in well 15/9-19 B was the most challenging to drill?"
      context: "DDR data shows NPT of 45 hours in the 12.25 inch section due to severe losses. WITSML confirms high torque fluctuations."
    assert:
      # Deterministic check: the answer must name the 12.25 inch section.
      - type: "icontains"
        value: "12.25"
      # Model-graded checks: stated confidence and cited evidence.
      - type: "llm-rubric"
        value: "The response MUST explicitly state a 'Confidence Level' or 'Uncertainty'."
      - type: "llm-rubric"
        value: "The response must clearly state the evidence (either data or reports) used to make the conclusion."

  # Case 2: turn historical weather-related NPT data into lessons learned.
  - vars:
      question: "What were the lessons learned regarding weather-induced NPT?"
      context: "Historical Volve data indicates waiting on weather (WOW) caused 15% of all delays, particularly stalling riser pulling operations."
    assert:
      # Model-graded check: synthesis rather than restating the context.
      - type: "llm-rubric"
        value: "The response must synthesize the context to identify actionable lessons learned, not just repeat the data."
      # Deterministic check: reject boilerplate assistant disclaimers.
      - type: "not-icontains"
        value: "As an AI language model"